diff --git a/.gitattributes b/.gitattributes index 1ff0c423..4c6690d0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,62 +2,4 @@ # Set default behavior to automatically normalize line endings. ############################################################################### * text=auto - -############################################################################### -# Set default behavior for command prompt diff. -# -# This is need for earlier builds of msysgit that does not have it on by -# default for csharp files. -# Note: This is only used by command line -############################################################################### -#*.cs diff=csharp - -############################################################################### -# Set the merge driver for project and solution files -# -# Merging from the command prompt will add diff markers to the files if there -# are conflicts (Merging from VS is not affected by the settings below, in VS -# the diff markers are never inserted). Diff markers may cause the following -# file extensions to fail to load in VS. An alternative would be to treat -# these files as binary and thus will always conflict and require user -# intervention with every merge. To do so, just uncomment the entries below -############################################################################### -#*.sln merge=binary -#*.csproj merge=binary -#*.vbproj merge=binary -#*.vcxproj merge=binary -#*.vcproj merge=binary -#*.dbproj merge=binary -#*.fsproj merge=binary -#*.lsproj merge=binary -#*.wixproj merge=binary -#*.modelproj merge=binary -#*.sqlproj merge=binary -#*.wwaproj merge=binary - -############################################################################### -# behavior for image files -# -# image files are treated as binary by default. -############################################################################### -#*.jpg binary -#*.png binary -#*.gif binary - -############################################################################### -# diff behavior for common document formats -# -# Convert binary document formats to text before diffing them. This feature -# is only available from the command line. Turn it on by uncommenting the -# entries below. -############################################################################### -#*.doc diff=astextplain -#*.DOC diff=astextplain -#*.docx diff=astextplain -#*.DOCX diff=astextplain -#*.dot diff=astextplain -#*.DOT diff=astextplain -#*.pdf diff=astextplain -#*.PDF diff=astextplain -#*.rtf diff=astextplain -#*.RTF diff=astextplain +*.bat text eol=crlf \ No newline at end of file diff --git a/TestCharacterRanges.cs b/TestCharacterRanges.cs new file mode 100644 index 00000000..c302d75b --- /dev/null +++ b/TestCharacterRanges.cs @@ -0,0 +1,58 @@ +using System; +using WFInfo.LanguageProcessing; +using WFInfo.Settings; + +namespace WFInfo.Test +{ + /// + /// Simple test to verify character range generation works correctly + /// + public class TestCharacterRanges + { + public static void TestCharacterRanges() + { + Console.WriteLine("Testing character range generation..."); + + // Create a mock settings object + var settings = new TestApplicationSettings(); + + try + { + // Test Japanese processor + var japaneseProcessor = new JapaneseLanguageProcessor(settings); + var japaneseWhitelist = japaneseProcessor.CharacterWhitelist; + Console.WriteLine($"Japanese whitelist length: {japaneseWhitelist.Length}"); + + // Test Korean processor + var koreanProcessor = new KoreanLanguageProcessor(settings); + var koreanWhitelist = koreanProcessor.CharacterWhitelist; + Console.WriteLine($"Korean whitelist length: {koreanWhitelist.Length}"); + + // Test Chinese processors + var simplifiedProcessor = new SimplifiedChineseLanguageProcessor(settings); + var simplifiedWhitelist = simplifiedProcessor.CharacterWhitelist; + Console.WriteLine($"Simplified Chinese whitelist length: {simplifiedWhitelist.Length}"); + + var traditionalProcessor = new TraditionalChineseLanguageProcessor(settings); + var traditionalWhitelist = traditionalProcessor.CharacterWhitelist; + Console.WriteLine($"Traditional Chinese whitelist length: {traditionalWhitelist.Length}"); + + Console.WriteLine("All character range tests passed!"); + } + catch (Exception ex) + { + Console.WriteLine($"Error testing character ranges: {ex.Message}"); + throw; + } + } + } + + /// + /// Mock application settings for testing + /// + public class TestApplicationSettings : IReadOnlyApplicationSettings + { + public string Locale => "en"; + // Add other required properties as needed + } +} diff --git a/WFInfo/CustomEntrypoint.cs b/WFInfo/CustomEntrypoint.cs index d70cc31f..20dcd80c 100644 --- a/WFInfo/CustomEntrypoint.cs +++ b/WFInfo/CustomEntrypoint.cs @@ -14,6 +14,7 @@ using System.Linq; using System.CodeDom; using Tesseract; +using WFInfo.Tests; namespace WFInfo { @@ -83,6 +84,41 @@ public static void Main() Directory.CreateDirectory(appPath); + // Check for test execution arguments + // Usage: WFInfo.exe [--test] map.json [output.json] + string[] args = Environment.GetCommandLineArgs().Skip(1).ToArray(); + bool isTestMode = false; + + if (args.Length >= 1 && (args[0].Equals("--test", StringComparison.OrdinalIgnoreCase) || + args[0].Equals("-test", StringComparison.OrdinalIgnoreCase) || + args[0].Equals("--map", StringComparison.OrdinalIgnoreCase))) + { + isTestMode = true; + args = args.Skip(1).ToArray(); // strip flag + } + else if (args.Length >= 1 && args[0].EndsWith(".json", StringComparison.OrdinalIgnoreCase)) + { + isTestMode = true; + } + + if (isTestMode) + { + try + { + Console.WriteLine("WFInfo OCR Test Runner"); + Console.WriteLine("======================="); + TestProgram.RunTests(args).GetAwaiter().GetResult(); + return; + } + catch (Exception ex) + { + Console.WriteLine($"Test execution failed: {ex.Message}"); + Console.WriteLine(ex.StackTrace); + Environment.Exit(1); + return; + } + } + string thisprocessname = Process.GetCurrentProcess().ProcessName; string version = Assembly.GetExecutingAssembly().GetName().Version.ToString(); if (Process.GetProcesses().Count(p => p.ProcessName == thisprocessname) > 1) diff --git a/WFInfo/Data.cs b/WFInfo/Data.cs index 1f0b7b4d..2e653365 100644 --- a/WFInfo/Data.cs +++ b/WFInfo/Data.cs @@ -18,6 +18,7 @@ using WFInfo.Services.WarframeProcess; using WFInfo.Services.WindowInfo; using WFInfo.Settings; +using WFInfo.LanguageProcessing; namespace WFInfo { @@ -30,28 +31,6 @@ class Data public JObject equipmentData; // Contains equipmentData from Warframe PC Drops {: {"vaulted": true, "PARTS": {:{"relic_name":|"","count":}, ...}}, ...} public JObject nameData; // Contains relic to market name translation {: } - private static readonly List>> korean = new List>>() { - new Dictionary>() { - { 0, new List{ 6, 7, 8, 16 } }, // ㅁ, ㅂ, ㅃ, ㅍ - { 1, new List{ 2, 3, 4, 16, 5, 9, 10 } }, // ㄴ, ㄷ, ㄸ, ㅌ, ㄹ, ㅅ, ㅆ - { 2, new List{ 12, 13, 14 } }, // ㅈ, ㅉ, ㅊ - { 3, new List{ 0, 1, 15, 11, 18 } } // ㄱ, ㄲ, ㅋ, ㅇ, ㅎ - }, - new Dictionary>() { - { 0, new List{ 20, 5, 1, 7, 3, 19 } }, // ㅣ, ㅔ, ㅐ, ㅖ, ㅒ, ㅢ - { 1, new List{ 16, 11, 15, 10 } }, // ㅟ, ㅚ, ㅞ, ㅙ - { 2, new List{ 4, 0, 6, 2, 14, 9 } }, // ㅓ, ㅏ, ㅕ, ㅑ, ㅝ, ㅘ - { 3, new List{ 18, 13, 8, 17, 12 } } // ㅡ, ㅜ, ㅗ, ㅠ, ㅛ - }, - new Dictionary>() { - { 0, new List{ 16, 17, 18, 26 } }, // ㅁ, ㅂ, ㅄ, ㅍ - { 1, new List{ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 20, 25 } }, // ㄴ, ㄵ, ㄶ, ㄷ, ㄹ, ㄺ, ㄻ, ㄼ, ㄽ, ㄾ, ㄿ, ㅀ, ㅅ, ㅆ, ㅌ - { 2, new List{ 22, 23 } }, // ㅈ, ㅊ - { 3, new List{ 1, 2, 3, 24, 21, 27 } }, // ㄱ, ㄲ, ㄳ, ㅋ, ㅑ, ㅎ - { 4, new List{ 0 } }, // - } - }; - private readonly string applicationDirectory = Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData) + @"\WFInfo"; private readonly string marketItemsPath; private readonly string marketDataPath; @@ -60,7 +39,6 @@ class Data private readonly string nameDataPath; private readonly string filterAllJsonFallbackPath; private readonly string sheetJsonFallbackPath; - private readonly Dictionary wfmItemsFallbackPaths; public string JWT; // JWT is the security key, store this as email+pw combo' private ClientWebSocket marketSocket = new ClientWebSocket(); private CancellationTokenSource marketSocketCancellation = new CancellationTokenSource(); @@ -109,6 +87,9 @@ public Data(IReadOnlyApplicationSettings settings, IProcessFinder process, IWind _process = process; _window = window; + // Initialize the language processor factory + LanguageProcessorFactory.Initialize(settings); + Main.AddLog("Initializing Databases"); marketItemsPath = applicationDirectory + @"\market_items.json"; marketDataPath = applicationDirectory + @"\market_data.json"; @@ -117,12 +98,7 @@ public Data(IReadOnlyApplicationSettings settings, IProcessFinder process, IWind nameDataPath = applicationDirectory + @"\name_data.json"; filterAllJsonFallbackPath = applicationDirectory + @"\fallback_equipment_list.json"; sheetJsonFallbackPath = applicationDirectory + @"\fallback_price_sheet.json"; - wfmItemsFallbackPaths = new Dictionary(); - string[] locales = new string[] { "en", "ko" }; - foreach (string locale in locales) - { - wfmItemsFallbackPaths[locale] = applicationDirectory + @"\fallback_names_" + locale + ".json"; - } + // wfmItemsFallbackPath will be computed per-request in GetWfmItemList Directory.CreateDirectory(applicationDirectory); @@ -229,17 +205,42 @@ public async Task ReloadItems() items = JArray.FromObject(localizedItems.Data["data"]); foreach (var item in items) { - string name = item["slug"].ToString(); - if (name.Contains("prime") && tempMarketItems.ContainsKey(item["id"].ToString())) - tempMarketItems[item["id"].ToString()] = tempMarketItems[item["id"].ToString()] + "|" + item["i18n"][_settings.Locale]["name"]; + string itemId = item["id"].ToString(); + if (tempMarketItems.ContainsKey(itemId)) + { + // Validate presence of locale data and throw exception if missing + if (item["i18n"] == null) + { + throw new KeyNotFoundException($"Item {itemId} missing i18n data entirely"); + } + + if (item["i18n"][_settings.Locale] == null) + { + throw new KeyNotFoundException($"Item {itemId} missing locale data for {_settings.Locale}"); + } + + if (item["i18n"][_settings.Locale]["name"] == null) + { + throw new KeyNotFoundException($"Item {itemId} missing name field for locale {_settings.Locale}"); + } + + string localizedName = item["i18n"][_settings.Locale]["name"].ToString(); + tempMarketItems[itemId] = tempMarketItems[itemId] + "|" + localizedName; + } } + // Add locale metadata for cache validation + tempMarketItems["locale"] = _settings.Locale; + // Atomically replace marketItems under lock lock (marketItemsLock) { marketItems = tempMarketItems; } + // Save only the updated marketItems to file + SaveDatabase(marketItemsPath, marketItems); + Main.AddLog("Item database has been downloaded"); return enItems.IsFallback || localizedItems.IsFallback; } @@ -425,6 +426,9 @@ private async Task LoadMarketItem(string url) private async Task<(JObject Data, bool IsFallback)> GetWfmItemList(string locale) { + // Compute locale-specific fallback path per-request + string localeSpecificFallbackPath = Path.Combine(applicationDirectory, $"fallback_names.{locale}.json"); + try { using (var request = new HttpRequestMessage() @@ -440,30 +444,33 @@ private async Task LoadMarketItem(string url) var response = await client.SendAsync(request).ConfigureAwait(false); var body = await response.Content.ReadAsStringAsync().ConfigureAwait(false); var data = JsonConvert.DeserializeObject(body); - if (wfmItemsFallbackPaths.TryGetValue(locale, out var fallbackPath)) + + // Validate payload structure before caching + if (data != null && data["data"] != null && data["data"] is JArray) { - File.WriteAllText(fallbackPath, body); + File.WriteAllText(localeSpecificFallbackPath, body); + return (data, false); + } + else + { + Main.AddLog($"Invalid payload structure received from {wfmItemsUrl}, using fallback file {localeSpecificFallbackPath}"); + throw new InvalidDataException($"Invalid JSON payload structure from {wfmItemsUrl}"); } - return (data, false); } } catch (Exception ex) { - if (wfmItemsFallbackPaths.TryGetValue(locale, out var fallbackPath)) + Main.AddLog("Failed to fetch/parse " + wfmItemsUrl + ", using file " + localeSpecificFallbackPath + Environment.NewLine + ex.ToString()); + if (File.Exists(localeSpecificFallbackPath)) { - Main.AddLog("Failed to fetch/parse " + wfmItemsUrl + ", using file " + fallbackPath + Environment.NewLine + ex.ToString()); - if (File.Exists(fallbackPath)) - { - string response = File.ReadAllText(fallbackPath); - JObject data = JsonConvert.DeserializeObject(response); - return (data, true); - } + string response = File.ReadAllText(localeSpecificFallbackPath); + JObject data = JsonConvert.DeserializeObject(response); + return (data, true); } else { - Main.AddLog("Failed to fetch/parse " + wfmItemsUrl + ", and no fallback path found for locale: " + locale + Environment.NewLine + ex.ToString()); + throw new AggregateException("No local fallback found", ex); } - throw new AggregateException("No local fallback found", ex); } } @@ -576,7 +583,16 @@ private JObject ParseFileOrMakeNew(string path, ref bool parseHasFailed) { if (File.Exists(path)) { - return JsonConvert.DeserializeObject(File.ReadAllText(path)); + try + { + return JsonConvert.DeserializeObject(File.ReadAllText(path)); + } + catch (Exception ex) + { + Main.AddLog($"Failed to parse {path}: {ex.Message}"); + parseHasFailed = true; + return null; + } } Main.AddLog(path + " missing, loading blank"); parseHasFailed = true; @@ -595,25 +611,45 @@ public async Task UpdateInner(bool force) if (marketData == null) { marketData = ParseFileOrMakeNew(marketDataPath, ref parseHasFailed); + if (marketData == null) + { + throw new InvalidDataException($"Failed to parse marketData from '{marketDataPath}'. JSON deserialization returned null."); + } } lock (marketItemsLock) { if (marketItems == null) { marketItems = ParseFileOrMakeNew(marketItemsPath, ref parseHasFailed); + if (marketItems == null) + { + throw new InvalidDataException($"Failed to parse marketItems from '{marketItemsPath}'. JSON deserialization returned null."); + } } } if (equipmentData == null) { equipmentData = ParseFileOrMakeNew(equipmentDataPath, ref parseHasFailed); + if (equipmentData == null) + { + throw new InvalidDataException($"Failed to parse equipmentData from '{equipmentDataPath}'. JSON deserialization returned null."); + } } if (relicData == null) { relicData = ParseFileOrMakeNew(relicDataPath, ref parseHasFailed); + if (relicData == null) + { + throw new InvalidDataException($"Failed to parse relicData from '{relicDataPath}'. JSON deserialization returned null."); + } } if (nameData == null) { nameData = ParseFileOrMakeNew(nameDataPath, ref parseHasFailed); + if (nameData == null) + { + throw new InvalidDataException($"Failed to parse nameData from '{nameDataPath}'. JSON deserialization returned null."); + } } string oldMarketTimeText; @@ -829,185 +865,44 @@ public int GetDifference(char c1, char c2) public int LevenshteinDistance(string s, string t) { - switch (_settings.Locale) - { - case "ko": - // for korean - return LevenshteinDistanceKorean(s, t); - default: - return LevenshteinDistanceDefault(s, t); - } - } - - public static int LevenshteinDistanceDefault(string s, string t) - { - // Levenshtein Distance determines how many character changes it takes to form a known result - // For example: Nuvo Prime is closer to Nova Prime (2) then Ash Prime (4) - // For more info see: https://en.wikipedia.org/wiki/Levenshtein_distance - s = s.ToLower(Main.culture); - t = t.ToLower(Main.culture); - int n = s.Length; - int m = t.Length; - int[,] d = new int[n + 1, m + 1]; - - if (n == 0 || m == 0) - return n + m; - - d[0, 0] = 0; - - int count = 0; - for (int i = 1; i <= n; i++) - d[i, 0] = (s[i - 1] == ' ' ? count : ++count); - - count = 0; - for (int j = 1; j <= m; j++) - d[0, j] = (t[j - 1] == ' ' ? count : ++count); - - for (int i = 1; i <= n; i++) - for (int j = 1; j <= m; j++) - { - // deletion of s - int opt1 = d[i - 1, j]; - if (s[i - 1] != ' ') - opt1++; - - // deletion of t - int opt2 = d[i, j - 1]; - if (t[j - 1] != ' ') - opt2++; - - // swapping s to t - int opt3 = d[i - 1, j - 1]; - if (t[j - 1] != s[i - 1]) - opt3++; - d[i, j] = Math.Min(Math.Min(opt1, opt2), opt3); - } - - - - return d[n, m]; - } - - // This isn't used anymore?! - public static bool IsKorean(String str) - { - // Safeguard for empty strings that will give false positives and/or crashes - if (string.IsNullOrEmpty(str)) return false; - char c = str[0]; - if (0x1100 <= c && c <= 0x11FF) return true; - if (0x3130 <= c && c <= 0x318F) return true; - if (0xAC00 <= c && c <= 0xD7A3) return true; - return false; + var processor = LanguageProcessorFactory.GetCurrentProcessor(); + return processor.CalculateLevenshteinDistance(s, t); } public string GetLocaleNameData(string s) { - string localeName = ""; - - lock (marketItemsLock) - { - if (marketItems != null) // Add null check - { - foreach (var marketItem in marketItems) - { - if (marketItem.Key == "version") - continue; - string[] split = marketItem.Value.ToString().Split('|'); - if (split[0] == s) - { - localeName = split.Length > 2 ? split[2] : ""; - break; - } - } - } - } - - return localeName; + return GetLocaleNameData(s, true); } - private protected static string e = "A?s/,;j_> group, int ak, int bk) + /// + /// Resolves OCR-specific ambiguities between similar-looking operator names + /// + /// Current best match + /// Candidate alternative + /// Original OCR text for disambiguation + /// True if the candidate should be preferred over current + private bool ResolveOcrAmbiguity(string currentBest, string candidate, string ocrText) { - foreach (var entry in group) - { - if (entry.Value.Contains(ak) && entry.Value.Contains(bk)) - { - return true; - } - } + // Handle Gara/Ivara OCR confusion - these operators have similar visual patterns + if (currentBest.StartsWith("Gara") && candidate.StartsWith("Ivara")) + return true; + + // Handle Gara/Mesa OCR confusion - garbled "Mesa" (e.g. "Mggga") can tie with "Gara" at same Levenshtein distance + // Use first character of OCR text to disambiguate since M and G are visually distinct + if (currentBest.StartsWith("Gara") && candidate.StartsWith("Mesa") && + !string.IsNullOrEmpty(ocrText) && ocrText.StartsWith("M", StringComparison.OrdinalIgnoreCase)) + return true; + + // Future OCR ambiguities can be added here return false; } @@ -1095,30 +990,115 @@ public string GetPartName(string name, out int low, bool suppressLogging, out bo string lowest_unfiltered = null; low = 9999; multipleLowest = false; - foreach (KeyValuePair prop in nameData) - { - int val = LevenshteinDistance(prop.Key, name); - if (val < low) - { - low = val; - lowest = prop.Value.ToObject(); - lowest_unfiltered = prop.Key; - multipleLowest = false; + + // For all non-English supported languages - check against localized names directly to avoid expensive conversion + if (_settings.Locale != "en") + { + // Check against localized names in marketItems + List> marketItemsSnapshot; + var processor = LanguageProcessorFactory.GetCurrentProcessor(); + string normalizedName = processor.NormalizeForPatternMatching(name); + + // Snapshot minimal data needed under lock + lock (marketItemsLock) + { + if (marketItems != null) + { + // Check if cached locale matches current locale + string cachedLocale = marketItems.TryGetValue("locale", out var localeToken) ? localeToken?.ToString() : null; + bool useLocalizedNames = cachedLocale == _settings.Locale; + + marketItemsSnapshot = new List>(); + + foreach (var marketItem in marketItems) + { + if (marketItem.Key == "version") continue; + string[] split = marketItem.Value.ToString().Split('|'); + if (split.Length < 2) continue; + + // Use English name (split[0]) for length comparison regardless of locale cache + int englishNameLength = split[0].Length; + int lengthDiff = Math.Abs((useLocalizedNames && split.Length >= 3 ? split[2].Length : split[0].Length) - name.Length); + if (lengthDiff > Math.Max(englishNameLength, name.Length) / 2) continue; + + // Use localized name only if cache locale matches and available, otherwise fall back to English + string comparisonName = useLocalizedNames && split.Length >= 3 ? split[2] : split[0]; + marketItemsSnapshot.Add(Tuple.Create(split[0], comparisonName, processor.NormalizeForPatternMatching(comparisonName))); + } + } + else + { + marketItemsSnapshot = new List>(); + } } - else if (val == low) - { - multipleLowest = true; + + // Do heavy Levenshtein work outside lock + foreach (var item in marketItemsSnapshot) + { + string englishName = item.Item1; + string storedName = item.Item2; + string normalizedStored = item.Item3; + + int val = processor.CalculateLevenshteinDistance(normalizedName, normalizedStored); + + // Distance filter: Only accept matches with distance < 50% of string length (like GetLocalizedNameData) + if (val >= storedName.Length * 0.5) continue; + + if (val < low) + { + low = val; + lowest = englishName; // Return English name + lowest_unfiltered = storedName; // Show localized name in log + multipleLowest = false; + } + else if (val == low) + { + multipleLowest = true; + } } - - if (val == low && lowest.StartsWith("Gara") && prop.Key.StartsWith("Ivara")) //If both + } + else + { + // Original logic for English + // For English, resolvedName is just the original OCR text + string resolvedName = name; + + foreach (KeyValuePair prop in nameData) { - lowest = prop.Value.ToObject(); - lowest_unfiltered = prop.Key; + int lengthDiff = Math.Abs(prop.Key.Length - name.Length); + if (lengthDiff > Math.Max(prop.Key.Length, name.Length) / 2) continue; // Skip if too different in length + + // Resolve OCR text to English for proper comparison (without recursive Levenshtein calls) + int val = LevenshteinDistance(prop.Key, resolvedName); + + // Distance filter: Only accept matches with distance < 50% of string length + if (val >= prop.Key.Length * 0.5) continue; + + if (val < low) + { + low = val; + lowest = prop.Value.ToObject(); + lowest_unfiltered = prop.Key; + multipleLowest = false; + } + else if (val == low) + { + multipleLowest = true; + } + + // Handle OCR ambiguity between Gara and Ivara operators + // These operators have similar visual patterns that can confuse OCR + if (val == low && ResolveOcrAmbiguity(lowest, prop.Key, resolvedName)) + { + lowest = prop.Value.ToObject(); + lowest_unfiltered = prop.Key; + } } } if (!suppressLogging) Main.AddLog("Found part(" + low + "): \"" + lowest_unfiltered + "\" from \"" + name + "\""); + return lowest; } @@ -1127,11 +1107,24 @@ public string GetPartNameHuman(string name, out int low) string lowest = null; string lowest_unfiltered = null; low = 9999; + + // Resolve OCR text to English once before loops to avoid repeated expensive database searches + // Only resolve for non-English locales to avoid regression in English + string resolvedName; + if (_settings.Locale == "en") + { + resolvedName = name; // Use original OCR text for English + } + else + { + resolvedName = GetLocaleNameData(name, false) ?? name; // Fallback to original OCR string if resolution fails + } + foreach (KeyValuePair prop in nameData) { if (prop.Value.ToString().ToLower(Main.culture).Contains(name.ToLower(Main.culture))) { - int val = LevenshteinDistance(prop.Value.ToString(), name); + int val = LevenshteinDistance(prop.Value.ToString(), resolvedName); if (val < low) { low = val; @@ -1144,7 +1137,7 @@ public string GetPartNameHuman(string name, out int low) { foreach (KeyValuePair prop in nameData) { - int val = LevenshteinDistance(prop.Value.ToString(), name); + int val = LevenshteinDistance(prop.Value.ToString(), resolvedName); if (val < low) { low = val; @@ -1192,7 +1185,7 @@ public static string GetSetName(string name) result = result.Replace("hilt", ""); result = result.Replace("link", ""); result = result.TrimEnd(); - result = Main.culture.TextInfo.ToTitleCase(result); + result = LanguageProcessorFactory.GetCurrentProcessor().Culture.TextInfo.ToTitleCase(result); result += " Set"; return result; } @@ -1460,7 +1453,7 @@ public static void SetUserAgent(ClientWebSocketOptions options, string userAgent options.SetRequestHeader("User-Agent", userAgent); return; } - catch (System.ArgumentException ex) + catch (System.ArgumentException) { //Debug.WriteLine(ex.ToString()); // Fallback to reflection if User-Agent is not settable diff --git a/WFInfo/LanguageProcessing/ChineseLanguageProcessor.cs b/WFInfo/LanguageProcessing/ChineseLanguageProcessor.cs new file mode 100644 index 00000000..51eb169c --- /dev/null +++ b/WFInfo/LanguageProcessing/ChineseLanguageProcessor.cs @@ -0,0 +1,146 @@ +using System; +using System.Text.RegularExpressions; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Base class for Chinese language processors containing shared behaviors + /// + public abstract class ChineseLanguageProcessorBase : LanguageProcessor + { + protected ChineseLanguageProcessorBase(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string CharacterWhitelist => + string.Concat(GenerateCharacterRangeIterator(0x4E00, 0x7FFF)) + + string.Concat(GenerateCharacterRangeIterator(0x8000, 0x9FFF)) + + GenerateCharacterRange(0x3400, 0x4DBF) + + GenerateCharacterRange(0xF900, 0xFAFF) + + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz "; // Full CJK ideographs + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + // Basic cleanup for Chinese + string normalized = input.ToLower(_culture).Trim(); + + // Add spaces around "Prime" to match database format better + normalized = normalized.Replace("prime", " prime "); + + // Remove accents (not typically needed for Chinese) + normalized = RemoveAccents(normalized); + + // Remove extra spaces + var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + return string.Join(" ", parts); + } + + public override bool IsPartNameValid(string partName) + { + // Chinese requires minimum of 4 characters after removing spaces + return !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 4; + } + + public override bool ShouldFilterWord(string word) + { + return FilterWordCore(word); + } + + /// + /// Shared filtering logic for Chinese word processing + /// + public static bool FilterWordCore(string word) + { + if (string.IsNullOrEmpty(word)) return true; + + bool hasCJK = ContainsCJK(word); + bool hasLatin = false; + foreach (char c in word) + { + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) + { + hasLatin = true; + break; + } + } + + // Pure CJK words: keep (even single chars are meaningful in Chinese) + if (hasCJK && !hasLatin) return false; + + // Pure Latin words: shortest valid item name component is 3 chars (Ash, Nyx, Mag) + // Filter Latin-only words with <= 2 chars ("ll", "ee", "on", "me" = OCR noise from UI) + if (hasLatin && !hasCJK) return word.Length <= 2; + + // Mixed Latin+CJK: filter short mixed words (like "G壬") which are OCR garbage + // Valid mixed text is always longer (e.g. "Prime" next to CJK is separate words) + if (hasCJK && hasLatin && word.Length <= 2) return true; + + // Keep everything else + return false; + } + + /// + /// Checks if a string contains CJK characters + /// + public static bool ContainsCJK(string text) + { + foreach (char c in text) + { + if ((c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x3400 && c <= 0x4DBF) || (c >= 0xF900 && c <= 0xFAFF)) + return true; + } + return false; + } + + /// + /// Normalizes Chinese characters for comparison + /// + protected static string NormalizeChineseCharacters(string input) + { + return NormalizeFullWidthCharacters(input).ToLowerInvariant(); + } + } + + /// + /// Simplified Chinese language processor for OCR text processing + /// Handles Simplified Chinese characters + /// + public class SimplifiedChineseLanguageProcessor : ChineseLanguageProcessorBase + { + public SimplifiedChineseLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "zh-hans"; + + public override string[] BlueprintRemovals => new[] { "蓝图", "设计图" }; + + public override int CalculateLevenshteinDistance(string s, string t) + { + return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizeChineseCharacters, callBaseDefault: true); + } + } + + /// + /// Traditional Chinese language processor for OCR text processing + /// Handles Traditional Chinese characters + /// + public class TraditionalChineseLanguageProcessor : ChineseLanguageProcessorBase + { + public TraditionalChineseLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "zh-hant"; + + public override string[] BlueprintRemovals => new[] { "藍圖", "設計圖" }; + + public override int CalculateLevenshteinDistance(string s, string t) + { + return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizeChineseCharacters, callBaseDefault: true); + } + } +} diff --git a/WFInfo/LanguageProcessing/CyrillicLanguageProcessor.cs b/WFInfo/LanguageProcessing/CyrillicLanguageProcessor.cs new file mode 100644 index 00000000..72725be4 --- /dev/null +++ b/WFInfo/LanguageProcessing/CyrillicLanguageProcessor.cs @@ -0,0 +1,111 @@ +using System; +using System.Text.RegularExpressions; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Russian language processor for OCR text processing + /// Handles Russian Cyrillic characters with Latin transliteration + /// + public class RussianLanguageProcessor : LanguageProcessor + { + public RussianLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "ru"; + + public override string[] BlueprintRemovals => new string[0]; // No blueprint removals - handled in NormalizeForPatternMatching + + public override string CharacterWhitelist => GenerateCharacterRange(0x0400, 0x04FF) + GenerateCharacterRange(0x0500, 0x052F) + ": "; // Cyrillic + Cyrillic Supplement + + public override int CalculateLevenshteinDistance(string s, string t) + { + // For Russian, don't normalize Cyrillic to Latin - we want to match Russian to Russian + return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, null); + } + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + // Basic cleanup for Russian + string normalized = input.ToLower(_culture).Trim(); + + // Handle Russian blueprint format: "Чертёж: " -> " (чертеж)" + if (normalized.StartsWith("чертёж:") || normalized.StartsWith("чертеж:")) + { + // Extract item name after "чертёж:" / "чертеж:" with optional whitespace + string itemName = Regex.Replace(normalized, @"^черт[её]ж:\s*", ""); + normalized = itemName + " (чертеж)"; + } + + // Remove extra spaces + var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + return string.Join(" ", parts); + } + + public override bool IsPartNameValid(string partName) + { + // Russian requires minimum of 6 characters after removing spaces + return !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 6; + } + + public override bool ShouldFilterWord(string word) + { + // Russian filters very short words (less than 2 characters) + return !string.IsNullOrEmpty(word) && word.Length < 2; + } + } + + /// + /// Ukrainian language processor for OCR text processing + /// Handles Ukrainian Cyrillic characters with Latin transliteration + /// + public class UkrainianLanguageProcessor : LanguageProcessor + { + public UkrainianLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "uk"; + + public override string[] BlueprintRemovals => new[] { "Кресленник" }; + + public override string CharacterWhitelist => GenerateCharacterRange(0x0400, 0x04FF) + GenerateCharacterRange(0x0500, 0x052F) + ": -()"; // Cyrillic + Cyrillic Supplement + + public override int CalculateLevenshteinDistance(string s, string t) + { + // For Ukrainian, don't normalize Cyrillic to Latin - we want to match Ukrainian to Ukrainian + return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, null); + } + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + // Basic cleanup for Ukrainian + string normalized = input.ToLower(_culture).Trim(); + + // Remove accents (not typically needed for Ukrainian) + //normalized = RemoveAccents(normalized); + + // Remove extra spaces + var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + return string.Join(" ", parts); + } + + public override bool IsPartNameValid(string partName) + { + // Ukrainian requires minimum of 6 characters after removing spaces + return !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 6; + } + + public override bool ShouldFilterWord(string word) + { + // Ukrainian filters very short words (less than 2 characters) + return !string.IsNullOrEmpty(word) && word.Length < 2; + } + } +} diff --git a/WFInfo/LanguageProcessing/EnglishLanguageProcessor.cs b/WFInfo/LanguageProcessing/EnglishLanguageProcessor.cs new file mode 100644 index 00000000..abd3f07a --- /dev/null +++ b/WFInfo/LanguageProcessing/EnglishLanguageProcessor.cs @@ -0,0 +1,55 @@ +using System; +using System.Text.RegularExpressions; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// English language processor for OCR text processing + /// Handles standard English text with basic normalization + /// + public class EnglishLanguageProcessor : LanguageProcessor + { + public EnglishLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "en"; + + public override string[] BlueprintRemovals => new[] { "Blueprint" }; + + public override string CharacterWhitelist => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + public override int CalculateLevenshteinDistance(string s, string t) + { + return DefaultLevenshteinDistance(s, t); + } + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + // Basic cleanup for English + string normalized = input.ToLower(_culture).Trim(); + + // Add spaces around "Prime" to match database format better + normalized = normalized.Replace("prime", " prime "); + + // Remove extra spaces + var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + return string.Join(" ", parts); + } + + public override bool IsPartNameValid(string partName) + { + // English requires minimum length of 13 characters + return !string.IsNullOrEmpty(partName) && partName.Length >= 13; + } + + public override bool ShouldFilterWord(string word) + { + // English filters very short words (less than 2 characters) + return !string.IsNullOrEmpty(word) && word.Length < 2; + } + } +} diff --git a/WFInfo/LanguageProcessing/EuropeanLanguageProcessor.cs b/WFInfo/LanguageProcessing/EuropeanLanguageProcessor.cs new file mode 100644 index 00000000..517bbdd8 --- /dev/null +++ b/WFInfo/LanguageProcessing/EuropeanLanguageProcessor.cs @@ -0,0 +1,244 @@ +using System; +using System.Text.RegularExpressions; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Base class for European language processors with common diacritic handling + /// + public abstract class EuropeanLanguageProcessorBase : LanguageProcessor + { + protected EuropeanLanguageProcessorBase(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + // Basic cleanup for European languages + string normalized = input.ToLower(_culture).Trim(); + + // Add spaces around "Prime" to match database format better + normalized = normalized.Replace("prime", " prime "); + + // Don't remove accents for European languages since database has accented characters + // Remove extra spaces + var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + return string.Join(" ", parts); + } + + public override bool IsPartNameValid(string partName) + { + // European languages require minimum of 8 characters + return !string.IsNullOrEmpty(partName) && partName.Length >= 8; + } + + public override bool ShouldFilterWord(string word) + { + // European languages filter very short words (less than 2 characters) + return !string.IsNullOrEmpty(word) && word.Length < 2; + } + + public override int CalculateLevenshteinDistance(string s, string t) + { + return DefaultLevenshteinDistance(s, t); + } + + protected override int DefaultLevenshteinDistance(string s, string t) + { + return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, input => NormalizeEuropeanCharacters(input), callBaseDefault: true); + } + + /// + /// Normalizes European characters for comparison + /// + protected static string NormalizeEuropeanCharacters(string input) + { + // Convert common European diacritics to standard equivalents for comparison + return input.ToLowerInvariant() + .Replace('à', 'a').Replace('á', 'a').Replace('â', 'a').Replace('ã', 'a').Replace('ä', 'a').Replace('å', 'a') + .Replace('è', 'e').Replace('é', 'e').Replace('ê', 'e').Replace('ë', 'e') + .Replace('ì', 'i').Replace('í', 'i').Replace('î', 'i').Replace('ï', 'i') + .Replace('ò', 'o').Replace('ó', 'o').Replace('ô', 'o').Replace('õ', 'o').Replace('ö', 'o') + .Replace('ù', 'u').Replace('ú', 'u').Replace('û', 'u').Replace('ü', 'u') + .Replace('ñ', 'n') + .Replace('ç', 'c') + .Replace('ÿ', 'y') + .Replace('Ç', 'C') + .Replace('Ÿ', 'Y'); + } + } + + /// + /// German language processor for OCR text processing + /// Handles German characters with umlauts + /// + public class GermanLanguageProcessor : EuropeanLanguageProcessorBase + { + public GermanLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "de"; + + public override string[] BlueprintRemovals => new[] { "Blaupause", "Plan" }; + + public override string CharacterWhitelist => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz " + GenerateCharacterRange(0x00C4, 0x00C4) + GenerateCharacterRange(0x00D6, 0x00D6) + GenerateCharacterRange(0x00DC, 0x00DC) + GenerateCharacterRange(0x00DF, 0x00DF) + GenerateCharacterRange(0x00E4, 0x00E4) + GenerateCharacterRange(0x00F6, 0x00F6) + GenerateCharacterRange(0x00FC, 0x00FC); // German with umlauts + } + + /// + /// Spanish language processor for OCR text processing + /// Handles Spanish characters with accents and special characters + /// + public class SpanishLanguageProcessor : EuropeanLanguageProcessorBase + { + public SpanishLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "es"; + + public override string[] BlueprintRemovals => new[] { "Plano", "Diseño" }; + + public override string CharacterWhitelist => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz " + + GenerateCharacterRange(0x00C1, 0x00C1) + // Á + GenerateCharacterRange(0x00C9, 0x00C9) + // É + GenerateCharacterRange(0x00CD, 0x00CD) + // Í + GenerateCharacterRange(0x00D1, 0x00D1) + // Ñ + GenerateCharacterRange(0x00D3, 0x00D3) + // Ó + GenerateCharacterRange(0x00DA, 0x00DA) + // Ú + GenerateCharacterRange(0x00DC, 0x00DC) + // Ü + GenerateCharacterRange(0x00E1, 0x00E1) + // á + GenerateCharacterRange(0x00E9, 0x00E9) + // é + GenerateCharacterRange(0x00ED, 0x00ED) + // í + GenerateCharacterRange(0x00F1, 0x00F1) + // ñ + GenerateCharacterRange(0x00F3, 0x00F3) + // ó + GenerateCharacterRange(0x00FA, 0x00FA) + // ú + GenerateCharacterRange(0x00FC, 0x00FC); // ü + } + + /// + /// Portuguese language processor for OCR text processing + /// Handles Portuguese characters with accents and special characters + /// + public class PortugueseLanguageProcessor : EuropeanLanguageProcessorBase + { + public PortugueseLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "pt"; + + public override string[] BlueprintRemovals => new[] { "Planta", "Projeto" }; + + public override string CharacterWhitelist => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz " + + GenerateCharacterRange(0x00C0, 0x00C0) + // À + GenerateCharacterRange(0x00C1, 0x00C1) + // Á + GenerateCharacterRange(0x00C2, 0x00C2) + //  + GenerateCharacterRange(0x00C3, 0x00C3) + // à + GenerateCharacterRange(0x00C7, 0x00C7) + // Ç + GenerateCharacterRange(0x00C9, 0x00C9) + // É + GenerateCharacterRange(0x00CA, 0x00CA) + // Ê + GenerateCharacterRange(0x00CD, 0x00CD) + // Í + GenerateCharacterRange(0x00D3, 0x00D3) + // Ó + GenerateCharacterRange(0x00D4, 0x00D4) + // Ô + GenerateCharacterRange(0x00D5, 0x00D5) + // Õ + GenerateCharacterRange(0x00DA, 0x00DA) + // Ú + GenerateCharacterRange(0x00DC, 0x00DC) + // Ü + GenerateCharacterRange(0x00E0, 0x00E0) + // à + GenerateCharacterRange(0x00E1, 0x00E1) + // á + GenerateCharacterRange(0x00E2, 0x00E2) + // â + GenerateCharacterRange(0x00E3, 0x00E3) + // ã + GenerateCharacterRange(0x00E7, 0x00E7) + // ç + GenerateCharacterRange(0x00E9, 0x00E9) + // é + GenerateCharacterRange(0x00EA, 0x00EA) + // ê + GenerateCharacterRange(0x00ED, 0x00ED) + // í + GenerateCharacterRange(0x00F3, 0x00F3) + // ó + GenerateCharacterRange(0x00F4, 0x00F4) + // ô + GenerateCharacterRange(0x00F5, 0x00F5) + // õ + GenerateCharacterRange(0x00FA, 0x00FA) + // ú + GenerateCharacterRange(0x00FC, 0x00FC); // ü + } + + /// + /// French language processor for OCR text processing + /// Handles French characters with accents and special localization logic + /// + public class FrenchLanguageProcessor : EuropeanLanguageProcessorBase + { + public FrenchLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "fr"; + + public override string[] BlueprintRemovals => new[] { "Schéma", "Plan" }; + + public override string CharacterWhitelist => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz " + + GenerateCharacterRange(0x00C0, 0x00C0) + // À + GenerateCharacterRange(0x00C2, 0x00C2) + //  + GenerateCharacterRange(0x00C6, 0x00C6) + // Æ + GenerateCharacterRange(0x00C7, 0x00C7) + // Ç + GenerateCharacterRange(0x00C8, 0x00C8) + // È + GenerateCharacterRange(0x00C9, 0x00C9) + // É + GenerateCharacterRange(0x00CA, 0x00CA) + // Ê + GenerateCharacterRange(0x00CB, 0x00CB) + // Ë + GenerateCharacterRange(0x00CE, 0x00CE) + // Î + GenerateCharacterRange(0x00CF, 0x00CF) + // Ï + GenerateCharacterRange(0x00D4, 0x00D4) + // Ô + GenerateCharacterRange(0x00D6, 0x00D6) + // Ö + GenerateCharacterRange(0x00D9, 0x00D9) + // Ù + GenerateCharacterRange(0x00DB, 0x00DB) + // Û + GenerateCharacterRange(0x00DC, 0x00DC) + // Ü + GenerateCharacterRange(0x00E0, 0x00E0) + // à + GenerateCharacterRange(0x00E2, 0x00E2) + // â + GenerateCharacterRange(0x00E6, 0x00E6) + // æ + GenerateCharacterRange(0x00E7, 0x00E7) + // ç + GenerateCharacterRange(0x00E8, 0x00E8) + // è + GenerateCharacterRange(0x00E9, 0x00E9) + // é + GenerateCharacterRange(0x00EA, 0x00EA) + // ê + GenerateCharacterRange(0x00EB, 0x00EB) + // ë + GenerateCharacterRange(0x00EE, 0x00EE) + // î + GenerateCharacterRange(0x00EF, 0x00EF) + // ï + GenerateCharacterRange(0x00F4, 0x00F4) + // ô + GenerateCharacterRange(0x00F6, 0x00F6) + // ö + GenerateCharacterRange(0x00F9, 0x00F9) + // ù + GenerateCharacterRange(0x00FB, 0x00FB) + // û + GenerateCharacterRange(0x00FC, 0x00FC); // ü + } + + /// + /// Italian language processor for OCR text processing + /// Handles Italian characters with accents + /// + public class ItalianLanguageProcessor : EuropeanLanguageProcessorBase + { + public ItalianLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "it"; + + public override string[] BlueprintRemovals => new[] { "Progetto", "Piano" }; + + public override string CharacterWhitelist => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-()" + + GenerateCharacterRange(0x00C0, 0x00C0) + // À + GenerateCharacterRange(0x00C8, 0x00C8) + // È + GenerateCharacterRange(0x00C9, 0x00C9) + // É + GenerateCharacterRange(0x00CC, 0x00CC) + // Ì + GenerateCharacterRange(0x00CD, 0x00CD) + // Í + GenerateCharacterRange(0x00D2, 0x00D2) + // Ò + GenerateCharacterRange(0x00D3, 0x00D3) + // Ó + GenerateCharacterRange(0x00D9, 0x00D9) + // Ù + GenerateCharacterRange(0x00E0, 0x00E0) + // à + GenerateCharacterRange(0x00E8, 0x00E8) + // è + GenerateCharacterRange(0x00E9, 0x00E9) + // é + GenerateCharacterRange(0x00EC, 0x00EC) + // ì + GenerateCharacterRange(0x00ED, 0x00ED) + // í + GenerateCharacterRange(0x00F2, 0x00F2) + // ò + GenerateCharacterRange(0x00F3, 0x00F3) + // ó + GenerateCharacterRange(0x00F9, 0x00F9); // ù + } +} diff --git a/WFInfo/LanguageProcessing/JapaneseLanguageProcessor.cs b/WFInfo/LanguageProcessing/JapaneseLanguageProcessor.cs new file mode 100644 index 00000000..3ac16d48 --- /dev/null +++ b/WFInfo/LanguageProcessing/JapaneseLanguageProcessor.cs @@ -0,0 +1,229 @@ +using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Japanese language processor for OCR text processing + /// Handles Japanese Hiragana, Katakana, and Kanji characters + /// + public class JapaneseLanguageProcessor : LanguageProcessor + { + public JapaneseLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "ja"; + + public override string[] BlueprintRemovals => new[] { "設計図", "青図" }; + + public override string CharacterWhitelist => + GenerateCharacterRange(0x3040, 0x309F) + + GenerateCharacterRange(0x30A0, 0x30FF) + + string.Concat(GenerateCharacterRangeIterator(0x4E00, 0x6FFF)) + + GenerateCharacterRange(0x7000, 0x7FFF) + + GenerateCharacterRange(0x8000, 0x9FAF) + + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz "; // Japanese Hiragana, Katakana, Kanji + + public override int CalculateLevenshteinDistance(string s, string t) + { + // Check if both inputs contain Japanese characters for Japanese-aware comparison + bool sHasJapanese = ContainsJapanese(s); + bool tHasJapanese = ContainsJapanese(t); + + if (sHasJapanese && tHasJapanese) + { + // Japanese-aware path: use original Japanese characters with Japanese similarity logic + return CalculateJapaneseAwareDistance(s, t); + } + else + { + // Fallback/transliterated path: normalize to Latin equivalents + return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizeJapaneseCharacters, callBaseDefault: true); + } + } + + /// + /// Calculates Japanese-aware Levenshtein distance with character similarity groups + /// + private int CalculateJapaneseAwareDistance(string s, string t) + { + if (string.IsNullOrEmpty(s)) return string.IsNullOrEmpty(t) ? 0 : t.Length; + if (string.IsNullOrEmpty(t)) return s.Length; + + int n = s.Length; + int m = t.Length; + + if (n == 0) return m; + if (m == 0) return n; + + int[,] d = new int[n + 1, m + 1]; + + for (int i = 0; i <= n; i++) + d[i, 0] = i; + + for (int j = 0; j <= m; j++) + d[0, j] = j; + + for (int i = 1; i <= n; i++) + { + for (int j = 1; j <= m; j++) + { + int cost = GetJapaneseCharacterDifference(s[i - 1], t[j - 1]); + d[i, j] = Math.Min( + Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1), + d[i - 1, j - 1] + cost); + } + } + + return d[n, m]; + } + + /// + /// Gets the character difference cost for Japanese characters based on similarity groups + /// + private int GetJapaneseCharacterDifference(char a, char b) + { + if (a == b) return 0; + + // Hiragana-Katakana equivalents (lower cost for similar characters) + var hiraganaKatakanaPairs = new Dictionary + { + {'あ', 'ア'}, {'い', 'イ'}, {'う', 'ウ'}, {'え', 'エ'}, {'お', 'オ'}, + {'か', 'カ'}, {'き', 'キ'}, {'く', 'ク'}, {'け', 'ケ'}, {'こ', 'コ'}, + {'が', 'ガ'}, {'ぎ', 'ギ'}, {'ぐ', 'グ'}, {'げ', 'ゲ'}, {'ご', 'ゴ'}, + {'さ', 'サ'}, {'し', 'シ'}, {'す', 'ス'}, {'せ', 'セ'}, {'そ', 'ソ'}, + {'ざ', 'ザ'}, {'じ', 'ジ'}, {'ず', 'ズ'}, {'ぜ', 'ゼ'}, {'ぞ', 'ゾ'}, + {'た', 'タ'}, {'ち', 'チ'}, {'つ', 'ツ'}, {'て', 'テ'}, {'と', 'ト'}, + {'だ', 'ダ'}, {'ぢ', 'ヂ'}, {'づ', 'ヅ'}, {'で', 'デ'}, {'ど', 'ド'}, + {'な', 'ナ'}, {'に', 'ニ'}, {'ぬ', 'ヌ'}, {'ね', 'ネ'}, {'の', 'ノ'}, + {'は', 'ハ'}, {'ひ', 'ヒ'}, {'ふ', 'フ'}, {'へ', 'ヘ'}, {'ほ', 'ホ'}, + {'ば', 'バ'}, {'び', 'ビ'}, {'ぶ', 'ブ'}, {'べ', 'ベ'}, {'ぼ', 'ボ'}, + {'ぱ', 'パ'}, {'ぴ', 'ピ'}, {'ぷ', 'プ'}, {'ぺ', 'ペ'}, {'ぽ', 'ポ'}, + {'ま', 'マ'}, {'み', 'ミ'}, {'む', 'ム'}, {'め', 'メ'}, {'も', 'モ'}, + {'や', 'ヤ'}, {'ゆ', 'ユ'}, {'よ', 'ヨ'}, + {'ら', 'ラ'}, {'り', 'リ'}, {'る', 'ル'}, {'れ', 'レ'}, {'ろ', 'ロ'}, + {'わ', 'ワ'}, {'ゐ', 'ヰ'}, {'ゑ', 'ヱ'}, {'を', 'ヲ'}, {'ん', 'ン'}, + {'っ', 'ッ'}, {'ゃ', 'ャ'}, {'ゅ', 'ュ'}, {'ょ', 'ョ'} + }; + + // Check if characters are hiragana-katakana equivalents + if (hiraganaKatakanaPairs.TryGetValue(a, out var katakanaEquiv) && katakanaEquiv == b) + return 1; // Low cost for hiragana-katakana equivalents + if (hiraganaKatakanaPairs.TryGetValue(b, out var hiraganaEquiv) && hiraganaEquiv == a) + return 1; + + // Similar looking characters (common OCR confusions) + var similarChars = new[] + { + new[] {'シ', 'ツ'}, // shi/tsu confusion + new[] {'ソ', 'ン'}, // so/n confusion + new[] {'ク', 'ワ'}, // ku/wa confusion + new[] {'ヘ', 'へ'}, // he/he (different forms) + new[] {'ベ', 'べ'}, // be/be (different forms) + new[] {'ヲ', 'ヲ'}, // wo/wo (different forms) + new[] {'ヶ', 'ケ'}, // ke/ke variation + new[] {'ヵ', 'カ'}, // ka/ka variation + }; + + foreach (var pair in similarChars) + { + if ((a == pair[0] && b == pair[1]) || (a == pair[1] && b == pair[0])) + return 1; // Low cost for similar looking characters + } + + // Default cost for different characters + return 2; + } + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + // Apply Japanese-specific normalization first + string normalized = NormalizeJapaneseCharacters(input); + + // Basic cleanup for Japanese + normalized = normalized.ToLower(_culture).Trim(); + + // Add spaces around "Prime" to match database format better + normalized = normalized.Replace("prime", " prime "); + + // Remove accents (not typically needed for Japanese - preserve combining marks) + // normalized = RemoveAccents(normalized); + + // Remove extra spaces + var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + return string.Join(" ", parts); + } + + public override bool IsPartNameValid(string partName) + { + // Japanese requires minimum of 4 characters after removing spaces + return !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 4; + } + + + public override bool ShouldFilterWord(string word) + { + if (string.IsNullOrEmpty(word)) return true; + + bool hasJapanese = ContainsJapanese(word); + bool hasLatin = false; + foreach (char c in word) + { + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) + { + hasLatin = true; + break; + } + } + + // Keep all Japanese text (Hiragana/Katakana/Kanji characters) since Japanese words are meaningful + // even when split by OCR + if (hasJapanese) return false; + + // For mixed Japanese-Latin words, be more lenient + if (hasJapanese && hasLatin) return false; + + // For non-Japanese text, use standard filtering (filter very short words) + return word.Length < 2; + } + + /// + /// Checks if a string contains Japanese characters (Hiragana, Katakana, or Kanji) + /// + private static bool ContainsJapanese(string input) + { + foreach (char c in input) + { + // Hiragana (0x3040-0x309F) + if (c >= 0x3040 && c <= 0x309F) return true; + // Katakana (0x30A0-0x30FF) + if (c >= 0x30A0 && c <= 0x30FF) return true; + // Kanji (0x4E00-0x9FAF) + if (c >= 0x4E00 && c <= 0x9FAF) return true; + } + return false; + } + + /// + /// Normalizes Japanese characters for comparison + /// + private static string NormalizeJapaneseCharacters(string input) + { + string result = NormalizeFullWidthCharacters(input); + + // Normalize katakana/hiragana variations and common OCR confusions + result = result.Replace('ヶ', 'ケ').Replace('ヵ', 'カ'); + result = result.Replace('゙', '゛').Replace('゚', '゜'); // Handakuten and Dakuten normalization + + // Common katakana OCR confusions + result = result.Replace('ヲ', 'ヲ').Replace('ヮ', 'ワ').Replace('ヰ', 'イ').Replace('ヱ', 'エ').Replace('ヲ', 'オ'); + + return result.ToLowerInvariant(); + } + } +} diff --git a/WFInfo/LanguageProcessing/KoreanLanguageProcessor.cs b/WFInfo/LanguageProcessing/KoreanLanguageProcessor.cs new file mode 100644 index 00000000..12a46cb0 --- /dev/null +++ b/WFInfo/LanguageProcessing/KoreanLanguageProcessor.cs @@ -0,0 +1,494 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Korean language processor for OCR text processing + /// Handles Korean Hangul characters with special normalization rules + /// + public class KoreanLanguageProcessor : LanguageProcessor + { + + // Static spacing corrections to avoid recreating dictionary on every call + private static readonly Dictionary spacingCorrections = new Dictionary + { + {" ", " "}, {" ", " "}, {" ", " "} + }; + + // Static Korean character replacements to avoid recreating list on every call + private static readonly List> koreanReplacements = new List> + { + // Basic consonants and vowels + new KeyValuePair("가", "ga"), new KeyValuePair("개", "gae"), new KeyValuePair("갸", "gya"), new KeyValuePair("걔", "gyae"), new KeyValuePair("거", "geo"), new KeyValuePair("게", "ge"), new KeyValuePair("겨", "gyeo"), new KeyValuePair("계", "gye"), + new KeyValuePair("고", "go"), new KeyValuePair("과", "gwa"), new KeyValuePair("궈", "gwo"), new KeyValuePair("괘", "gwae"), new KeyValuePair("괴", "goe"), new KeyValuePair("교", "gyo"), new KeyValuePair("구", "gu"), + new KeyValuePair("궤", "gwe"), new KeyValuePair("귀", "gwi"), new KeyValuePair("규", "gyu"), new KeyValuePair("그", "geu"), new KeyValuePair("긔", "gui"), new KeyValuePair("기", "gi"), + + new KeyValuePair("나", "na"), new KeyValuePair("내", "nae"), new KeyValuePair("냐", "nya"), new KeyValuePair("냬", "nyae"), new KeyValuePair("너", "neo"), new KeyValuePair("네", "ne"), new KeyValuePair("녀", "nyeo"), new KeyValuePair("녜", "nye"), + new KeyValuePair("노", "no"), new KeyValuePair("놔", "nwa"), new KeyValuePair("놰", "nwo"), new KeyValuePair("뇌", "noe"), new KeyValuePair("뇨", "nyo"), new KeyValuePair("누", "nu"), new KeyValuePair("뉘", "nwi"), + new KeyValuePair("뉴", "nyu"), new KeyValuePair("느", "neu"), new KeyValuePair("늬", "nui"), new KeyValuePair("니", "ni"), + + new KeyValuePair("다", "da"), new KeyValuePair("대", "dae"), new KeyValuePair("댜", "dya"), new KeyValuePair("댸", "dyae"), new KeyValuePair("더", "deo"), new KeyValuePair("데", "de"), new KeyValuePair("뎌", "dyeo"), new KeyValuePair("뎨", "dye"), + new KeyValuePair("도", "do"), new KeyValuePair("돠", "dwa"), new KeyValuePair("돼", "dwae"), new KeyValuePair("됴", "dyo"), new KeyValuePair("두", "du"), new KeyValuePair("둬", "dwo"), new KeyValuePair("뒈", "dwae"), + new KeyValuePair("뒤", "dwi"), new KeyValuePair("듀", "dyu"), new KeyValuePair("드", "deu"), new KeyValuePair("듸", "dui"), new KeyValuePair("디", "di"), + + new KeyValuePair("라", "ra"), new KeyValuePair("래", "rae"), new KeyValuePair("랴", "rya"), new KeyValuePair("럐", "ryae"), new KeyValuePair("러", "reo"), new KeyValuePair("레", "re"), new KeyValuePair("려", "ryeo"), new KeyValuePair("례", "rye"), + new KeyValuePair("로", "ro"), new KeyValuePair("롸", "rwa"), new KeyValuePair("뢔", "roe"), new KeyValuePair("료", "ryo"), new KeyValuePair("루", "ru"), new KeyValuePair("뤄", "rwo"), new KeyValuePair("뤠", "rwae"), new KeyValuePair("뤼", "rwi"), + new KeyValuePair("류", "ryu"), new KeyValuePair("르", "reu"), new KeyValuePair("릐", "rui"), new KeyValuePair("리", "ri"), + + new KeyValuePair("마", "ma"), new KeyValuePair("매", "mae"), new KeyValuePair("먀", "mya"), new KeyValuePair("먜", "myae"), new KeyValuePair("머", "meo"), new KeyValuePair("메", "me"), new KeyValuePair("며", "myeo"), new KeyValuePair("몌", "mye"), + new KeyValuePair("모", "mo"), new KeyValuePair("뫄", "mwa"), new KeyValuePair("뫠", "mwae"), new KeyValuePair("뫼", "moe"), new KeyValuePair("묘", "myo"), new KeyValuePair("무", "mu"), new KeyValuePair("뭐", "mwo"), new KeyValuePair("뭬", "mwae"), + new KeyValuePair("뮈", "mwi"), new KeyValuePair("뮤", "myu"), new KeyValuePair("므", "meu"), new KeyValuePair("믜", "mui"), new KeyValuePair("미", "mi"), + + new KeyValuePair("바", "ba"), new KeyValuePair("배", "bae"), new KeyValuePair("뱌", "bya"), new KeyValuePair("뱨", "byae"), new KeyValuePair("버", "beo"), new KeyValuePair("베", "be"), new KeyValuePair("벼", "byeo"), new KeyValuePair("볘", "bye"), + new KeyValuePair("보", "bo"), new KeyValuePair("봐", "bwa"), new KeyValuePair("봬", "bwae"), new KeyValuePair("뵈", "boe"), new KeyValuePair("뵤", "byo"), new KeyValuePair("부", "bu"), new KeyValuePair("붜", "bwo"), new KeyValuePair("붸", "bwae"), + new KeyValuePair("뷔", "bwi"), new KeyValuePair("뷰", "byu"), new KeyValuePair("브", "beu"), new KeyValuePair("븨", "bui"), new KeyValuePair("비", "bi"), + + new KeyValuePair("사", "sa"), new KeyValuePair("새", "sae"), new KeyValuePair("샤", "sya"), new KeyValuePair("섀", "syae"), new KeyValuePair("서", "seo"), new KeyValuePair("세", "se"), new KeyValuePair("셔", "syeo"), new KeyValuePair("셰", "sye"), + new KeyValuePair("소", "so"), new KeyValuePair("솨", "swa"), new KeyValuePair("쇄", "swae"), new KeyValuePair("쇠", "soe"), new KeyValuePair("쇼", "syo"), new KeyValuePair("수", "su"), new KeyValuePair("숴", "swo"), new KeyValuePair("쉐", "swae"), + new KeyValuePair("쉬", "swi"), new KeyValuePair("슈", "syu"), new KeyValuePair("스", "seu"), new KeyValuePair("싀", "sui"), new KeyValuePair("시", "si"), + + new KeyValuePair("아", "a"), new KeyValuePair("애", "ae"), new KeyValuePair("야", "ya"), new KeyValuePair("얘", "yae"), new KeyValuePair("어", "eo"), new KeyValuePair("에", "e"), new KeyValuePair("여", "yeo"), new KeyValuePair("예", "ye"), + new KeyValuePair("오", "o"), new KeyValuePair("와", "wa"), new KeyValuePair("왜", "wae"), new KeyValuePair("외", "oe"), new KeyValuePair("요", "yo"), new KeyValuePair("우", "u"), new KeyValuePair("워", "wo"), new KeyValuePair("웨", "we"), + new KeyValuePair("위", "wi"), new KeyValuePair("유", "yu"), new KeyValuePair("으", "eu"), new KeyValuePair("의", "ui"), new KeyValuePair("이", "i"), + + new KeyValuePair("자", "ja"), new KeyValuePair("재", "jae"), new KeyValuePair("쟈", "jya"), new KeyValuePair("쟤", "jyae"), new KeyValuePair("저", "jeo"), new KeyValuePair("제", "je"), new KeyValuePair("져", "jyeo"), new KeyValuePair("졔", "jye"), + new KeyValuePair("조", "jo"), new KeyValuePair("좌", "jwa"), new KeyValuePair("좨", "jwae"), new KeyValuePair("죄", "joe"), new KeyValuePair("죠", "jyo"), new KeyValuePair("주", "ju"), new KeyValuePair("줘", "jwo"), new KeyValuePair("줴", "jwae"), + new KeyValuePair("쥐", "jwi"), new KeyValuePair("쥬", "jyu"), new KeyValuePair("즈", "jeu"), new KeyValuePair("즤", "jui"), new KeyValuePair("지", "ji"), + + new KeyValuePair("차", "cha"), new KeyValuePair("채", "chae"), new KeyValuePair("챠", "chya"), new KeyValuePair("챼", "chyae"), new KeyValuePair("처", "cheo"), new KeyValuePair("체", "che"), new KeyValuePair("쳐", "chyeo"), new KeyValuePair("쳬", "chye"), + new KeyValuePair("초", "cho"), new KeyValuePair("촤", "chwa"), new KeyValuePair("쵀", "chwae"), new KeyValuePair("최", "choe"), new KeyValuePair("쵸", "chyo"), new KeyValuePair("추", "chu"), new KeyValuePair("춰", "chwo"), new KeyValuePair("췌", "chwae"), + new KeyValuePair("취", "chwi"), new KeyValuePair("츄", "chyu"), new KeyValuePair("츠", "cheu"), new KeyValuePair("츼", "chui"), new KeyValuePair("치", "chi"), + + new KeyValuePair("카", "ka"), new KeyValuePair("캐", "kae"), new KeyValuePair("캬", "kya"), new KeyValuePair("컈", "kyae"), new KeyValuePair("커", "keo"), new KeyValuePair("케", "ke"), new KeyValuePair("켜", "kyeo"), new KeyValuePair("켸", "kye"), + new KeyValuePair("코", "ko"), new KeyValuePair("콰", "kwa"), new KeyValuePair("쾌", "kwae"), new KeyValuePair("쾨", "koe"), new KeyValuePair("쿄", "kyo"), new KeyValuePair("쿠", "ku"), new KeyValuePair("퀘", "kwo"), + new KeyValuePair("퀴", "kwi"), new KeyValuePair("큐", "kyu"), new KeyValuePair("크", "keu"), new KeyValuePair("킈", "kui"), new KeyValuePair("키", "ki"), + + new KeyValuePair("타", "ta"), new KeyValuePair("태", "tae"), new KeyValuePair("탸", "tya"), new KeyValuePair("턔", "tyae"), new KeyValuePair("터", "teo"), new KeyValuePair("테", "te"), new KeyValuePair("텨", "tyeo"), new KeyValuePair("톄", "tye"), + new KeyValuePair("토", "to"), new KeyValuePair("톼", "twa"), new KeyValuePair("퇘", "twae"), new KeyValuePair("퇴", "toe"), new KeyValuePair("툐", "tyo"), new KeyValuePair("투", "tu"), new KeyValuePair("퉈", "two"), new KeyValuePair("퉤", "twae"), + new KeyValuePair("튀", "twi"), new KeyValuePair("튜", "tyu"), new KeyValuePair("트", "teu"), new KeyValuePair("틔", "tui"), new KeyValuePair("티", "ti"), + + new KeyValuePair("파", "pa"), new KeyValuePair("패", "pae"), new KeyValuePair("퍄", "pya"), new KeyValuePair("퍠", "pyae"), new KeyValuePair("퍼", "peo"), new KeyValuePair("페", "pe"), new KeyValuePair("펴", "pyeo"), new KeyValuePair("폐", "pye"), + new KeyValuePair("포", "po"), new KeyValuePair("퐈", "pwa"), new KeyValuePair("퐤", "pwae"), new KeyValuePair("푀", "poe"), new KeyValuePair("표", "pyo"), new KeyValuePair("푸", "pu"), new KeyValuePair("풔", "pwo"), new KeyValuePair("풰", "pwae"), + new KeyValuePair("퓌", "pwi"), new KeyValuePair("퓨", "pyu"), new KeyValuePair("프", "peu"), new KeyValuePair("픠", "pui"), new KeyValuePair("피", "pi"), + + new KeyValuePair("하", "ha"), new KeyValuePair("해", "hae"), new KeyValuePair("햐", "hya"), new KeyValuePair("햬", "hyae"), new KeyValuePair("허", "heo"), new KeyValuePair("헤", "he"), new KeyValuePair("혀", "hyeo"), new KeyValuePair("혜", "hye"), + new KeyValuePair("호", "ho"), new KeyValuePair("화", "hwa"), new KeyValuePair("홰", "hwae"), new KeyValuePair("회", "hoe"), new KeyValuePair("효", "hyo"), new KeyValuePair("후", "hu"), new KeyValuePair("훠", "hwo"), new KeyValuePair("훼", "hwe"), + new KeyValuePair("휘", "hwi"), new KeyValuePair("류", "ryu"), new KeyValuePair("휴", "hyu"), new KeyValuePair("흐", "heu"), new KeyValuePair("희", "hui"), new KeyValuePair("히", "hi"), + }; + + // Precomputed ordered Korean replacements to avoid repeated sorting + private static readonly List> koreanReplacementsOrdered = + koreanReplacements.OrderByDescending(r => r.Key.Length).ToList(); + + // Korean character similarity groups for enhanced matching + // Expanded to cover more OCR confusions and visual similarities + private static readonly List>> Korean = new List>>() { + // Initial consonants (초성) + new Dictionary>() { + { 0, new List{ 6, 7, 8, 16 } }, // ㄱ, ㄲ, ㄴ, ㄷ + { 1, new List{ 2, 3, 4, 16, 5, 9, 10, 17, 18 } }, // ㄷ, ㄸ, ㄹ, ㅁ, ㅂ, ㅃ, ㅅ, ㅆ, ㅇ, ㅈ, ㅉ, ㅊ, ㅋ, ㅌ, ㅍ, ㅎ + { 2, new List{ 12, 13, 14, 19, 20 } }, // ㅈ, ㅉ, ㅊ, ㅋ, ㅌ + { 3, new List{ 0, 1, 15, 11, 18, 21, 22 } }, // ㄱ, ㄲ, ㅋ, ㅇ, ㅎ, additional visual similarities + { 4, new List{ 1, 5, 6, 7 } }, // ㄹ, ㅁ, ㅂ, ㅃ (rounded shapes) + { 5, new List{ 4, 6, 7, 8 } }, // ㅁ, ㄹ, ㅂ, ㅃ (box-like shapes) + { 6, new List{ 0, 7, 8, 5 } }, // ㅂ, ㄱ, ㅃ, ㅁ + { 7, new List{ 6, 0, 8, 5 } }, // ㅃ, ㅂ, ㄱ, ㅁ + { 8, new List{ 0, 6, 7 } }, // ㅎ, ㄱ, ㅂ, ㅃ + { 9, new List{ 10, 11, 12 } }, // ㅅ, ㅆ, ㅈ (vertical strokes) + { 10, new List{ 9, 11, 12 } }, // ㅆ, ㅅ, ㅈ + { 11, new List{ 9, 10, 12, 13 } }, // ㅇ, ㅅ, ㅆ, ㅈ, ㅉ + { 12, new List{ 9, 10, 11, 13, 14 } }, // ㅈ, ㅅ, ㅆ, ㅇ, ㅉ, ㅊ + { 13, new List{ 12, 14 } }, // ㅉ, ㅈ, ㅊ + { 14, new List{ 12, 13, 15 } }, // ㅊ, ㅈ, ㅉ, ㅋ + { 15, new List{ 3, 14, 16 } }, // ㅋ, ㄱ, ㅎ, ㅊ + { 16, new List{ 3, 15 } }, // ㅌ, ㄱ, ㅋ + { 17, new List{ 18 } }, // ㅍ, ㅎ + { 18, new List{ 3, 17 } } // ㅎ, ㄱ, ㅍ + }, + // Vowels (중성) + new Dictionary>() { + { 0, new List{ 20, 5, 1, 7, 3, 19, 21, 22 } }, // ㅣ, ㅔ, ㅐ, ㅖ, ㅒ, ㅢ, additional vertical vowels + { 1, new List{ 16, 11, 15, 10, 23, 24 } }, // ㅟ, ㅚ, ㅞ, ㅙ, additional compound vowels + { 2, new List{ 4, 0, 6, 2, 14, 9, 25, 26 } }, // ㅓ, ㅏ, ㅕ, ㅑ, ㅝ, ㅘ, additional horizontal vowels + { 3, new List{ 18, 13, 8, 17, 12, 27, 28 } }, // ㅡ, ㅜ, ㅗ, ㅠ, ㅛ, additional horizontal vowels + { 4, new List{ 2, 6, 9, 14 } }, // ㅏ, ㅓ, ㅕ, ㅑ, ㅘ + { 5, new List{ 0, 1, 7, 19 } }, // ㅐ, ㅣ, ㅔ, ㅖ, ㅒ + { 6, new List{ 2, 4, 9, 14 } }, // ㅑ, ㅓ, ㅏ, ㅕ, ㅘ + { 7, new List{ 0, 5, 1, 19 } }, // ㅒ, ㅣ, ㅐ, ㅔ, ㅖ + { 8, new List{ 3, 13, 17, 18 } }, // ㅗ, ㅡ, ㅠ, ㅜ + { 9, new List{ 2, 4, 6, 14 } }, // ㅜ, ㅓ, ㅏ, ㅑ, ㅘ + { 10, new List{ 1, 15, 11, 16 } }, // ㅠ, ㅟ, ㅚ, ㅞ + { 11, new List{ 1, 10, 15, 16 } }, // ㅡ, ㅟ, ㅠ, ㅚ, ㅞ + { 12, new List{ 3, 18, 13, 17 } }, // ㅛ, ㅡ, ㅗ, ㅠ + { 13, new List{ 3, 8, 18, 17 } }, // ㅝ, ㅡ, ㅗ, ㅜ + { 14, new List{ 2, 4, 6, 9 } }, // ㅘ, ㅓ, ㅏ, ㅑ, ㅜ + { 15, new List{ 1, 10, 11, 16 } }, // ㅚ, ㅟ, ㅠ, ㅡ, ㅞ + { 16, new List{ 1, 10, 11, 15 } }, // ㅞ, ㅟ, ㅠ, ㅡ, ㅚ + { 17, new List{ 3, 8, 12, 13 } }, // ㅟ, ㅡ, ㅗ, ㅛ, ㅝ + { 18, new List{ 3, 8, 11, 13 } }, // ㅢ, ㅡ, ㅗ, ㅝ + { 19, new List{ 0, 5, 7, 1 } }, // ㅖ, ㅣ, ㅐ, ㅒ, ㅔ + // Additional compound vowels and visual similarities + { 20, new List{ 0, 5 } }, // ㅔ variants + { 21, new List{ 0, 1 } }, // ㅐ variants + { 22, new List{ 2, 4 } }, // ㅕ variants + { 23, new List{ 3, 8 } }, // ㅛ variants + { 24, new List{ 9, 2 } }, // ㅜ variants + { 25, new List{ 14, 2 } }, // ㅘ variants + { 26, new List{ 13, 3 } }, // ㅝ variants + { 27, new List{ 12, 3 } }, // ㅛ variants + { 28, new List{ 17, 1 } } // ㅟ variants + }, + // Final consonants (종성) + new Dictionary>() { + { 0, new List{ 16, 17, 18, 26, 27, 28 } }, // ㄱ, ㄲ, ㄳ, ㄴ, ㄵ, ㄶ, ㄷ, ㄹ, ㄺ, ㄻ, ㄼ, ㄽ, ㄾ, ㄿ, ㅀ, ㅅ, ㅆ, ㅇ, ㅈ, ㅊ, ㅋ, ㅌ, ㅍ, ㅎ + { 1, new List{ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 20, 25, 29, 30 } }, // ㄴ cluster and similar endings + { 2, new List{ 22, 23, 31, 32 } }, // ㅈ, ㅊ, ㅋ, ㅌ cluster + { 3, new List{ 1, 2, 3, 24, 21, 27, 33 } }, // ㄱ cluster and similar + { 4, new List{ 0 } }, // No final consonant + // Expanded final consonant similarities for OCR + { 5, new List{ 6, 7, 8, 9 } }, // ㄵ, ㄶ, ㄷ, ㄹ similarities + { 6, new List{ 5, 7, 8, 10 } }, // ㄶ, ㄵ, ㄷ, ㄹ similarities + { 7, new List{ 5, 6, 8, 11 } }, // ㄷ, ㄵ, ㄶ, ㄹ similarities + { 8, new List{ 5, 6, 7, 12 } }, // ㄹ, ㄵ, ㄶ, ㄷ similarities + { 9, new List{ 10, 11, 12, 13 } }, // ㄺ, ㄻ, ㄼ, ㄽ similarities + { 10, new List{ 9, 11, 12, 14 } }, // ㄻ, ㄺ, ㄼ, ㄽ similarities + { 11, new List{ 9, 10, 12, 15 } }, // ㄼ, ㄺ, ㄻ, ㄽ similarities + { 12, new List{ 9, 10, 11, 13 } }, // ㄽ, ㄺ, ㄻ, ㄼ similarities + { 13, new List{ 12, 14, 15 } }, // ㄾ, ㄽ, ㄼ, ㄾ similarities + { 14, new List{ 13, 15, 19 } }, // ㄿ, ㄾ, ㄼ, ㅀ similarities + { 15, new List{ 14, 19, 20 } }, // ㅀ, ㄿ, ㅅ, ㅆ similarities + { 16, new List{ 0, 17, 18 } }, // ㄲ, ㄱ, ㄳ similarities + { 17, new List{ 0, 16, 18 } }, // ㄳ, ㄱ, ㄲ similarities + { 18, new List{ 0, 16, 17 } }, // ㄵ, ㄱ, ㄲ, ㄳ similarities + { 19, new List{ 14, 15, 20 } }, // ㅅ, ㄿ, ㅀ, ㅆ similarities + { 20, new List{ 19, 15, 25 } }, // ㅆ, ㅅ, ㅀ, ㅌ similarities + { 21, new List{ 3, 24, 27 } }, // ㅈ, ㄱ, ㄹ, ㅋ similarities + { 22, new List{ 2, 23, 31 } }, // ㅊ, ㅈ, ㅋ similarities + { 23, new List{ 2, 22, 32 } }, // ㅋ, ㅈ, ㅊ, ㅌ similarities + { 24, new List{ 3, 21, 27 } }, // ㅌ, ㄱ, ㅈ, ㅋ similarities + { 25, new List{ 1, 20, 30 } }, // ㅍ, ㄴ, ㅆ, ㅎ similarities + { 26, new List{ 0, 27, 28 } }, // ㄱ, ㄹ, ㅎ similarities + { 27, new List{ 0, 26, 28, 33 } }, // ㄹ, ㄱ, ㅎ, ㅌ similarities + { 28, new List{ 0, 26, 27 } }, // ㅎ, ㄱ, ㄹ similarities + { 29, new List{ 1, 30 } }, // Additional ㄴ variations + { 30, new List{ 25, 29 } }, // Additional ㅍ variations + { 31, new List{ 22, 32 } }, // Additional ㅋ variations + { 32, new List{ 23, 31 } }, // Additional ㅌ variations + { 33, new List{ 3, 27 } } // Additional ㄱ variations + } + }; + + public KoreanLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "ko"; + + public override string[] BlueprintRemovals => new[] { "설계도" }; + + public override string CharacterWhitelist => + string.Concat(GenerateCharacterRangeIterator(0xAC00, 0xC6FF)) + + GenerateCharacterRange(0xC700, 0xD5FF) + + GenerateCharacterRange(0xD600, 0xD7AF) + " "; // Korean Hangul + + public override int CalculateLevenshteinDistance(string s, string t) + { + // i18n korean edit distance algorithm + // Normalize spacing but preserve word boundaries for better OCR fragment matching + s = NormalizeKoreanTextForComparison(s ?? ""); + t = NormalizeKoreanTextForComparison(t ?? ""); + + // Check if both inputs contain Hangul characters for Korean-aware comparison + bool sHasHangul = ContainsHangul(s); + bool tHasHangul = ContainsHangul(t); + + if (sHasHangul && tHasHangul) + { + // Korean-aware path: use original Hangul characters with Korean similarity logic + return CalculateKoreanAwareDistance(s, t); + } + else + { + // Fallback/transliterated path: normalize to Latin equivalents + s = NormalizeKoreanCharacters(s); + t = NormalizeKoreanCharacters(t); + return CalculateStandardDistance(s, t); + } + } + + /// + /// Normalizes Korean text for comparison by only removing spaces + /// Direct OCR to database matching with minimal tampering + /// + private string NormalizeKoreanTextForComparison(string input) + { + if (string.IsNullOrEmpty(input)) return " "; + + string result = NormalizeFullWidthCharacters(input); + + // Remove blueprint equivalents (e.g., "설계도") + foreach (string removal in BlueprintRemovals) + { + if (!string.IsNullOrEmpty(removal)) + { + result = Regex.Replace( + result, + Regex.Escape(removal), + "", + RegexOptions.CultureInvariant); + } + } + + // Remove whitespace (spaces, newlines, tabs) for OCR matching + result = Regex.Replace(result, @"\s+", "", RegexOptions.CultureInvariant); + + // Add leading space to match original algorithm structure + return " " + result; + } + + /// + /// Checks if a string contains any Hangul characters + /// + private static bool ContainsHangul(string input) + { + foreach (char c in input) + { + if (c >= 0xAC00 && c <= 0xD7AF) // Hangul syllables range + return true; + } + return false; + } + + /// + /// Calculates distance using Korean-aware similarity logic + /// + private int CalculateKoreanAwareDistance(string s, string t) + { + int n = s.Length; + int m = t.Length; + + if (n == 0) return m; + if (m == 0) return n; + + int[,] d = new int[n + 1, m + 1]; + + for (int i = 0; i <= n; i++) + d[i, 0] = i; + + for (int j = 0; j <= m; j++) + d[0, j] = j; + + for (int i = 1; i <= n; i++) + { + for (int j = 1; j <= m; j++) + { + int cost = GetKoreanCharacterDifference(s[i - 1], t[j - 1]); + d[i, j] = Math.Min( + Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1), + d[i - 1, j - 1] + cost); + } + } + + return d[n, m]; + } + + /// + /// Calculates standard distance without Korean-specific logic + /// + private int CalculateStandardDistance(string s, string t) + { + int n = s.Length; + int m = t.Length; + + if (n == 0) return m; + if (m == 0) return n; + + int[,] d = new int[n + 1, m + 1]; + + for (int i = 0; i <= n; i++) + d[i, 0] = i; + + for (int j = 0; j <= m; j++) + d[0, j] = j; + + for (int i = 1; i <= n; i++) + { + for (int j = 1; j <= m; j++) + { + int cost = (s[i - 1] == t[j - 1]) ? 0 : 1; + d[i, j] = Math.Min( + Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1), + d[i - 1, j - 1] + cost); + } + } + + return d[n, m]; + } + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + string result = NormalizeFullWidthCharacters(input); + + foreach (string removal in BlueprintRemovals) + { + if (!string.IsNullOrEmpty(removal)) + { + result = Regex.Replace( + result, + Regex.Escape(removal), + "", + RegexOptions.CultureInvariant); + } + } + + // Direct OCR to database matching - remove all whitespace + return Regex.Replace(result, @"\s+", "", RegexOptions.CultureInvariant); + } + + public override bool IsPartNameValid(string partName) + { + if (string.IsNullOrEmpty(partName)) return false; + + // Korean item names can be short (e.g. "렉스 프라임" = 5 chars without spaces) + // Use lower threshold than other languages to avoid dropping valid fragments + return Regex.Replace(partName, @"\s+", "", RegexOptions.CultureInvariant).Length >= 4; + } + + public override bool ShouldFilterWord(string word) + { + // Korean filtering: use intelligent analysis instead of hardcoded fragments + + if (string.IsNullOrEmpty(word)) return true; + + // Filter out very short non-Korean garbage (single characters that aren't Hangul) + if (word.Length == 1 && !IsHangulSyllable(word[0])) return true; + + // Keep all Korean text (Hangul characters) since Korean words are meaningful + // even when split by OCR + if (ContainsHangul(word)) return false; + + // For non-Korean text, use standard filtering (filter very short words) + return word.Length < 2; + } + + + /// + /// Gets the character difference cost for Korean characters based on similarity groups + /// + private int GetKoreanCharacterDifference(char a, char b) + { + if (a == b) return 0; + + // Handle Hangul decomposition for Korean-aware comparison + if (IsHangulSyllable(a) && IsHangulSyllable(b)) + { + // Decompose both characters into Jamo indices and compare + var jamoA = DecomposeHangul(a); + var jamoB = DecomposeHangul(b); + + // Compare each component (initial, medial, final) using similarity groups + int totalCost = 0; + + // Compare initial consonants (초성) + totalCost += CompareJamoSimilarity(jamoA.initialIndex, jamoB.initialIndex, 0); + + // Compare medial vowels (중성) + totalCost += CompareJamoSimilarity(jamoA.medialIndex, jamoB.medialIndex, 1); + + // Compare final consonants (종성) + totalCost += CompareJamoSimilarity(jamoA.finalIndex, jamoB.finalIndex, 2); + + return totalCost > 0 ? Math.Min(totalCost, 2) : 0; + } + + // Fallback to original logic for non-Hangul or mixed cases + // Check if characters are in the same similarity group + for (int group = 0; group < Korean.Count; group++) + { + foreach (var similarityGroup in Korean[group]) + { + if (similarityGroup.Value.Contains((int)a) && similarityGroup.Value.Contains((int)b)) + { + return 1; // Similar characters have lower cost + } + } + } + + return 2; // Different characters have higher cost + } + + /// + /// Checks if a character is a Hangul syllable + /// + private static bool IsHangulSyllable(char c) + { + return c >= 0xAC00 && c <= 0xD7AF; + } + + /// + /// Decomposes a Hangul syllable into Jamo component indices + /// + private static (int initialIndex, int medialIndex, int finalIndex) DecomposeHangul(char syllable) + { + if (!IsHangulSyllable(syllable)) + return (-1, -1, -1); + + int syllableIndex = syllable - 0xAC00; + + int finalIndex = syllableIndex % 28; // 0-27 (including no final consonant) + int medialIndex = (syllableIndex / 28) % 21; // 0-20 + int initialIndex = syllableIndex / (28 * 21); // 0-18 + + return (initialIndex, medialIndex, finalIndex); + } + + /// + /// Compares two Jamo indices using Korean similarity groups + /// + private int CompareJamoSimilarity(int indexA, int indexB, int groupType) + { + if (indexA == indexB) return 0; + if (indexA < 0 || indexB < 0) return 2; // Invalid indices + + // Use the Korean similarity groups for the specified type + if (groupType < Korean.Count) + { + foreach (var similarityGroup in Korean[groupType]) + { + // Check both the value list and the key for declared pairs + if ((similarityGroup.Value.Contains(indexA) && similarityGroup.Value.Contains(indexB)) || + (similarityGroup.Key == indexA && similarityGroup.Value.Contains(indexB)) || + (similarityGroup.Key == indexB && similarityGroup.Value.Contains(indexA))) + { + return 1; // Similar Jamo have lower cost + } + } + } + + return 2; // Different Jamo have higher cost + } + + /// + /// Normalizes Korean Hangul characters to Latin equivalents for comparison + /// Uses comprehensive mapping for common OCR confusions and variations + /// + private static string NormalizeKoreanCharacters(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + string result = input; + foreach (var replacement in koreanReplacementsOrdered) + { + result = result.Replace(replacement.Key, replacement.Value); + } + + return result; + } + } +} diff --git a/WFInfo/LanguageProcessing/LanguageProcessor.cs b/WFInfo/LanguageProcessing/LanguageProcessor.cs new file mode 100644 index 00000000..7544ad0f --- /dev/null +++ b/WFInfo/LanguageProcessing/LanguageProcessor.cs @@ -0,0 +1,376 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using Newtonsoft.Json.Linq; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Abstract base class for language-specific OCR text processing + /// Defines the contract that all language processors must implement + /// + public abstract class LanguageProcessor + { + // Per-type normalized blueprint removals to avoid recomputing on every call + private static readonly ConcurrentDictionary _normalizedBlueprintRemovalsCache = new ConcurrentDictionary(); + + protected readonly IReadOnlyApplicationSettings _settings; + protected readonly CultureInfo _culture; + + protected LanguageProcessor(IReadOnlyApplicationSettings settings) + { + _settings = settings ?? throw new ArgumentNullException(nameof(settings)); + _culture = GetCultureInfo(settings.Locale); + + // Initialize normalized blueprint removals once per concrete type + Type concreteType = GetType(); + _normalizedBlueprintRemovalsCache.GetOrAdd(concreteType, type => + { + var blueprintRemovals = BlueprintRemovals; + var normalized = new string[blueprintRemovals.Length]; + for (int i = 0; i < blueprintRemovals.Length; i++) + { + normalized[i] = blueprintRemovals[i].ToLowerInvariant(); + } + return normalized; + }); + } + + /// + /// Gets the appropriate CultureInfo for the locale + /// + /// Locale code + /// CultureInfo instance + private static CultureInfo GetCultureInfo(string locale) + { + try + { + return new CultureInfo(locale, false); + } + catch (Exception e) + { + // Log the failure and offending locale before falling back + System.Diagnostics.Debug.WriteLine($"Failed to create CultureInfo for locale '{locale}': {e.Message}"); + // Fallback to invariant culture for unsupported locales + return CultureInfo.InvariantCulture; + } + } + + /// + /// Gets the CultureInfo for this language processor + /// + public CultureInfo Culture => _culture; + + /// + /// Gets the locale code this processor handles (e.g., "en", "ko", "ja") + /// + public abstract string Locale { get; } + + /// + /// Gets the blueprint removal terms for this language + /// + public abstract string[] BlueprintRemovals { get; } + + /// + /// Gets the Tesseract character whitelist for this language + /// + public abstract string CharacterWhitelist { get; } + + /// + /// Calculates Levenshtein distance between two strings using language-specific logic + /// + /// First string + /// Second string + /// Levenshtein distance + public abstract int CalculateLevenshteinDistance(string s, string t); + + /// + /// Normalizes characters for pattern matching in this language + /// + /// Input string to normalize + /// Normalized string + public abstract string NormalizeForPatternMatching(string input); + + /// + /// Validates if a part name meets minimum length requirements for this language + /// + /// Part name to validate + /// True if valid, false otherwise + public abstract bool IsPartNameValid(string partName); + + /// + /// Validates if a single word fragment should be filtered out during OCR processing + /// + /// Word fragment to validate + /// True if word should be filtered out (removed), false if word should be kept + public virtual bool ShouldFilterWord(string word) + { + // Default implementation: filter very short words (less than 2 characters) + return !string.IsNullOrEmpty(word) && word.Length < 2; + } + + /// + /// Checks if a text fragment is a blueprint term for this language + /// + /// Text fragment to check + /// True if blueprint term, false otherwise + public virtual bool IsBlueprintTerm(string text) + { + if (string.IsNullOrEmpty(text)) return false; + + // Normalize text for case-insensitive comparison + string normalizedText = text.ToLowerInvariant(); + + // Check against pre-normalized blueprint removal terms + // Handle common formats: standalone terms, in parentheses, etc. + var normalizedBlueprintRemovals = _normalizedBlueprintRemovalsCache[GetType()]; + for (int i = 0; i < normalizedBlueprintRemovals.Length; i++) + { + string normalizedRemoval = normalizedBlueprintRemovals[i]; + if (normalizedText.Contains(normalizedRemoval) || + normalizedText.StartsWith($"({normalizedRemoval}") || + normalizedText.EndsWith($"{normalizedRemoval})")) + { + return true; + } + } + return false; + } + + /// + /// Gets localized name data from market items using language-specific matching + /// + /// Input string to match + /// Market items dictionary + /// Whether to use full Levenshtein distance + /// Best matching localized name + public virtual string GetLocalizedNameData(string input, JObject marketItems, bool useLevenshtein) + { + if (string.IsNullOrEmpty(input) || marketItems == null) + return input; + + string bestMatch = input; + int bestDistance = int.MaxValue; + + foreach (KeyValuePair item in marketItems) + { + if (item.Key == "version") continue; + + string[] split = item.Value.ToString().Split('|'); + if (split.Length < 3) continue; + + string localizedName = split[2]; + if (string.IsNullOrEmpty(localizedName)) continue; + + // Skip if length difference is too large + int lengthDiff = Math.Abs(input.Length - localizedName.Length); + if (lengthDiff > localizedName.Length / 2) continue; + + int distance; + if (useLevenshtein) + { + distance = CalculateLevenshteinDistance(input, localizedName); + } + else + { + string normalizedInput = NormalizeForPatternMatching(input); + string normalizedStored = NormalizeForPatternMatching(localizedName); + distance = SimpleLevenshteinDistance(normalizedInput, normalizedStored); + } + + // Only accept matches that are reasonably close (less than 50% difference) + if (distance < bestDistance && distance < localizedName.Length * 0.5) + { + bestDistance = distance; + bestMatch = split[0]; // Return the English name + } + } + + return bestMatch; + } + + /// + /// Default Levenshtein distance implementation for languages that don't need special handling + /// + protected virtual int DefaultLevenshteinDistance(string s, string t) + { + s = s.ToLower(_culture); + t = t.ToLower(_culture); + return ComputeLevenshteinCore(s, t); + } + + /// + /// Simple Levenshtein distance that avoids circular dependencies + /// + public int SimpleLevenshteinDistance(string s, string t) + { + return ComputeLevenshteinCore(s, t); + } + + /// + /// Helper method for Levenshtein distance with preprocessing + /// + protected int LevenshteinDistanceWithPreprocessing(string s, string t, string[] blueprintRemovals, Func normalizer = null, bool callBaseDefault = false) + { + // Remove blueprint equivalents + s = " " + s; + t = " " + t; + + if (blueprintRemovals != null) + { + foreach (string removal in blueprintRemovals) + { + if (!string.IsNullOrEmpty(removal)) + { + s = System.Text.RegularExpressions.Regex.Replace(s, System.Text.RegularExpressions.Regex.Escape(removal), "", System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.CultureInvariant); + t = System.Text.RegularExpressions.Regex.Replace(t, System.Text.RegularExpressions.Regex.Escape(removal), "", System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.CultureInvariant); + } + } + } + + s = s.Replace(" ", ""); + t = t.Replace(" ", ""); + + // Apply character normalization if provided + if (normalizer != null) + { + s = normalizer(s); + t = normalizer(t); + } + + return callBaseDefault ? ComputeLevenshteinCore(s, t) : DefaultLevenshteinDistance(s, t); + } + + /// + /// Core Levenshtein distance implementation (non-virtual) + /// + private static int ComputeLevenshteinCore(string s, string t) + { + int n = s.Length; + int m = t.Length; + int[,] d = new int[n + 1, m + 1]; + + if (n == 0) return m; + if (m == 0) return n; + + for (int i = 0; i <= n; i++) + d[i, 0] = i; + + for (int j = 0; j <= m; j++) + d[0, j] = j; + + for (int i = 1; i <= n; i++) + { + for (int j = 1; j <= m; j++) + { + int cost = (t[j - 1] == s[i - 1]) ? 0 : 1; + + d[i, j] = Math.Min( + Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1), + d[i - 1, j - 1] + cost); + } + } + + return d[n, m]; + } + + /// + /// Removes diacritic marks from text + /// + protected static string RemoveAccents(string text) + { + if (string.IsNullOrEmpty(text)) return text; + + string normalized = text.Normalize(NormalizationForm.FormD); + StringBuilder sb = new StringBuilder(); + + foreach (char c in normalized) + { + if (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(c) != System.Globalization.UnicodeCategory.NonSpacingMark) + sb.Append(c); + } + + return sb.ToString().Normalize(NormalizationForm.FormC); + } + + /// + /// Converts full-width characters to half-width (for CJK languages) + /// + protected static string NormalizeFullWidthCharacters(string input) + { + if (string.IsNullOrEmpty(input)) + { + return input ?? string.Empty; + } + + var result = new System.Text.StringBuilder(input.Length); + + foreach (char c in input) + { + if (c == '\u3000') // Fullwidth space + { + result.Append(' '); + } + else if (c >= '\uFF01' && c <= '\uFF5E') // Fullwidth ASCII range + { + result.Append((char)(c - 0xFEE0)); + } + else + { + result.Append(c); // Leave other characters unchanged + } + } + + return result.ToString(); + } + + /// + /// Maximum safe size for character range generation to prevent memory issues + /// + private const int MaxGeneratedRangeSize = 10000; + + /// + /// Generates a string containing all characters in the specified Unicode range + /// + /// Starting Unicode code point + /// Ending Unicode code point + /// String containing all characters in the range + /// Thrown when range size exceeds safe limit + protected static string GenerateCharacterRange(int start, int end) + { + int rangeSize = end - start + 1; + if (rangeSize > MaxGeneratedRangeSize) + { + throw new ArgumentOutOfRangeException(nameof(end), + $"Character range size ({rangeSize}) exceeds maximum safe limit ({MaxGeneratedRangeSize}). " + + $"Use GenerateCharacterRangeIterator for large ranges."); + } + + var chars = new char[rangeSize]; + for (int i = 0; i < rangeSize; i++) + { + chars[i] = (char)(start + i); + } + return new string(chars); + } + + /// + /// Generates characters in the specified Unicode range using streaming (no large array allocation) + /// + /// Starting Unicode code point + /// Ending Unicode code point + /// Enumerable that yields characters in the range + protected static IEnumerable GenerateCharacterRangeIterator(int start, int end) + { + for (int i = start; i <= end; i++) + { + yield return (char)i; + } + } + } +} diff --git a/WFInfo/LanguageProcessing/LanguageProcessorFactory.cs b/WFInfo/LanguageProcessing/LanguageProcessorFactory.cs new file mode 100644 index 00000000..59a90070 --- /dev/null +++ b/WFInfo/LanguageProcessing/LanguageProcessorFactory.cs @@ -0,0 +1,163 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Factory class for managing language processors + /// Provides centralized access to language-specific OCR text processing + /// + public class LanguageProcessorFactory + { + private static readonly Dictionary _processors = new Dictionary(); + private static readonly object _lock = new object(); + private static IReadOnlyApplicationSettings _settings; + + /// + /// Initializes the factory with application settings + /// + /// Application settings + public static void Initialize(IReadOnlyApplicationSettings settings) + { + if (settings == null) + throw new ArgumentNullException(nameof(settings)); + + _settings = settings; + } + + /// + /// Gets the language processor for the specified locale + /// + /// Locale code (e.g., "en", "ko", "ja") + /// Language processor for the locale + public static LanguageProcessor GetProcessor(string locale) + { + if (string.IsNullOrEmpty(locale)) + locale = "en"; + + lock (_lock) + { + if (_processors.TryGetValue(locale, out LanguageProcessor processor)) + return processor; + + // Create new processor if not exists + processor = CreateProcessor(locale); + _processors[locale] = processor; + return processor; + } + } + + /// + /// Gets the current language processor based on settings + /// + /// Current language processor + public static LanguageProcessor GetCurrentProcessor() + { + if (_settings == null) + throw new InvalidOperationException("Factory not initialized. Call Initialize() first."); + + return GetProcessor(_settings.Locale); + } + + /// + /// Gets all supported locales + /// + /// Array of supported locale codes + public static string[] GetSupportedLocales() + { + return new[] + { + "en", // English + "ko", // Korean + "ja", // Japanese + "zh-hans", // Simplified Chinese + "zh-hant", // Traditional Chinese + "th", // Thai + "ru", // Russian + "uk", // Ukrainian + "tr", // Turkish + "pl", // Polish + "fr", // French + "de", // German + "es", // Spanish + "pt", // Portuguese + "it" // Italian + }; + } + + /// + /// Creates a language processor for the specified locale + /// + /// Locale code + /// New language processor instance + private static LanguageProcessor CreateProcessor(string locale) + { + if (_settings == null) + throw new InvalidOperationException("Factory not initialized. Call Initialize() first."); + + locale = locale.ToLowerInvariant(); + switch (locale) + { + case "en": + return new EnglishLanguageProcessor(_settings); + case "ko": + return new KoreanLanguageProcessor(_settings); + case "ja": + return new JapaneseLanguageProcessor(_settings); + case "zh-hans": + return new SimplifiedChineseLanguageProcessor(_settings); + case "zh-hant": + return new TraditionalChineseLanguageProcessor(_settings); + case "th": + return new ThaiLanguageProcessor(_settings); + case "ru": + return new RussianLanguageProcessor(_settings); + case "uk": + return new UkrainianLanguageProcessor(_settings); + case "tr": + return new TurkishLanguageProcessor(_settings); + case "pl": + return new PolishLanguageProcessor(_settings); + case "fr": + return new FrenchLanguageProcessor(_settings); + case "de": + return new GermanLanguageProcessor(_settings); + case "es": + return new SpanishLanguageProcessor(_settings); + case "pt": + return new PortugueseLanguageProcessor(_settings); + case "it": + return new ItalianLanguageProcessor(_settings); + default: + return new EnglishLanguageProcessor(_settings); // Default to English + } + } + + /// + /// Clears all cached processors + /// Useful for testing or when settings change + /// + public static void ClearCache() + { + lock (_lock) + { + _processors.Clear(); + } + } + + /// + /// Checks if a locale is supported + /// + /// Locale code to check + /// True if supported, false otherwise + public static bool IsLocaleSupported(string locale) + { + if (string.IsNullOrEmpty(locale)) + return false; + + return GetSupportedLocales().Contains(locale, StringComparer.OrdinalIgnoreCase); + } + } +} diff --git a/WFInfo/LanguageProcessing/PolishLanguageProcessor.cs b/WFInfo/LanguageProcessing/PolishLanguageProcessor.cs new file mode 100644 index 00000000..78241aa8 --- /dev/null +++ b/WFInfo/LanguageProcessing/PolishLanguageProcessor.cs @@ -0,0 +1,78 @@ +using System; +using System.Text.RegularExpressions; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Polish language processor for OCR text processing + /// Handles Polish characters with specific diacritics + /// + public class PolishLanguageProcessor : LanguageProcessor + { + public PolishLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "pl"; + + public override string[] BlueprintRemovals => new[] { "Plan", "Schemat" }; + + public override string CharacterWhitelist => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz " + GenerateCharacterRange(0x0104, 0x0107) + GenerateCharacterRange(0x0118, 0x0119) + GenerateCharacterRange(0x0141, 0x0144) + GenerateCharacterRange(0x015A, 0x015A) + "\u00d3\u00f3\u015a\u015b\u0179\u017a\u017b\u017c"; // Polish with ranges + missing letters + + public override int CalculateLevenshteinDistance(string s, string t) + { + return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizePolishCharacters, callBaseDefault: true); + } + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + // Basic cleanup for Polish + string normalized = input.ToLower(_culture).Trim(); + + // Add spaces around "Prime" to match database format better + normalized = normalized.Replace("prime", " prime "); + + // Remove accents (not typically needed for Polish as it has specific diacritics) + normalized = RemoveAccents(normalized); + + // Remove extra spaces + var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + return string.Join(" ", parts); + } + + public override bool IsPartNameValid(string partName) + { + // Polish requires minimum of 8 characters + return !string.IsNullOrEmpty(partName) && partName.Length >= 8; + } + + + /// + /// Normalizes Polish characters to standard equivalents for comparison + /// + private static string NormalizePolishCharacters(string input) + { + // Convert Polish characters to standard equivalents for comparison + return input.ToLowerInvariant() + .Replace('ą', 'a') + .Replace('Ą', 'A') + .Replace('ę', 'e') + .Replace('Ę', 'E') + .Replace('ć', 'c') + .Replace('Ć', 'C') + .Replace('ł', 'l') + .Replace('Ł', 'L') + .Replace('ś', 's') + .Replace('Ś', 'S') + .Replace('ź', 'z') + .Replace('Ź', 'Z') + .Replace('ż', 'z') + .Replace('Ż', 'Z') + .Replace('ó', 'o') + .Replace('Ó', 'O'); + } + } +} diff --git a/WFInfo/LanguageProcessing/ThaiLanguageProcessor.cs b/WFInfo/LanguageProcessing/ThaiLanguageProcessor.cs new file mode 100644 index 00000000..5d23ee61 --- /dev/null +++ b/WFInfo/LanguageProcessing/ThaiLanguageProcessor.cs @@ -0,0 +1,212 @@ +using System; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Thai language processor for OCR text processing + /// Handles Thai characters with tone mark normalization + /// + public class ThaiLanguageProcessor : LanguageProcessor + { + public ThaiLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "th"; + + public override string[] BlueprintRemovals => new[] { "แบบแปลน", "ภาพวาด" }; + + public override string CharacterWhitelist => GenerateCharacterRange(0x0E00, 0x0E7F) + " "; // Thai characters + + public override int CalculateLevenshteinDistance(string s, string t) + { + // Check if both inputs contain Thai characters for Thai-aware comparison + bool sHasThai = ContainsThai(s); + bool tHasThai = ContainsThai(t); + + if (sHasThai && tHasThai) + { + // Thai-aware path: use original Thai characters with Thai similarity logic + return CalculateThaiAwareDistance(s, t); + } + else + { + // Fallback/transliterated path: normalize to Latin equivalents + return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizeThaiCharacters, callBaseDefault: true); + } + } + + /// + /// Calculates Thai-aware Levenshtein distance with character similarity groups + /// + private int CalculateThaiAwareDistance(string s, string t) + { + if (string.IsNullOrEmpty(s)) return string.IsNullOrEmpty(t) ? 0 : t.Length; + if (string.IsNullOrEmpty(t)) return s.Length; + + int n = s.Length; + int m = t.Length; + + if (n == 0) return m; + if (m == 0) return n; + + int[,] d = new int[n + 1, m + 1]; + + for (int i = 0; i <= n; i++) + d[i, 0] = i; + + for (int j = 0; j <= m; j++) + d[0, j] = j; + + for (int i = 1; i <= n; i++) + { + for (int j = 1; j <= m; j++) + { + int cost = GetThaiCharacterDifference(s[i - 1], t[j - 1]); + d[i, j] = Math.Min( + Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1), + d[i - 1, j - 1] + cost); + } + } + + return d[n, m]; + } + + /// + /// Gets the character difference cost for Thai characters based on similarity groups + /// + private int GetThaiCharacterDifference(char a, char b) + { + if (a == b) return 0; + + // Similar looking Thai characters (common OCR confusions) + var similarChars = new[] + { + new[] {'ก', 'ฮ'}, // ko/ho - similar round shapes + new[] {'ด', 'ป'}, // do/po - similar loops + new[] {'ต', 'ถ'}, // to/tho - similar shapes + new[] {'บ', 'ป'}, // bo/po - similar loops + new[] {'อ', 'โ'}, // o/o - different forms + new[] {'ผ', 'ฝ'}, // pho/fo - similar shapes + new[] {'ซ', 'ศ', 'ษ'}, // so variations + new[] {'ง', 'ย'}, // ngo/yo - similar tails + new[] {'ม', 'น'}, // mo/no - similar curves + new[] {'ว', 'ใ'}, // wo/ai - similar shapes + }; + + foreach (var pair in similarChars) + { + if ((a == pair[0] && b == pair[1]) || (a == pair[1] && b == pair[0])) + return 1; // Low cost for similar looking characters + if (pair.Length == 3 && + ((a == pair[0] && b == pair[1]) || (a == pair[1] && b == pair[0]) || + (a == pair[0] && b == pair[2]) || (a == pair[2] && b == pair[0]) || + (a == pair[1] && b == pair[2]) || (a == pair[2] && b == pair[1]))) + return 1; + } + + // Tone mark confusions (lower cost for tone differences) + var toneMarks = new[] {'่', '้', '๊', '๋', '่', '้', '๊', '๋'}; // Different tone marks + bool aIsTone = toneMarks.Contains(a); + bool bIsTone = toneMarks.Contains(b); + if (aIsTone && bIsTone) return 1; // Low cost for tone mark differences + + // Default cost for different characters + return 2; + } + + public override bool ShouldFilterWord(string word) + { + if (string.IsNullOrEmpty(word)) return true; + + bool hasThai = ContainsThai(word); + bool hasLatin = false; + foreach (char c in word) + { + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) + { + hasLatin = true; + break; + } + } + + // Keep all Thai text since Thai words are meaningful even when split by OCR + if (hasThai) return false; + + // For mixed Thai-Latin words, be more lenient + if (hasThai && hasLatin) return false; + + // For non-Thai text, use standard filtering (filter very short words) + return word.Length < 2; + } + + /// + /// Checks if a string contains Thai characters + /// + private static bool ContainsThai(string input) + { + foreach (char c in input) + { + // Thai Unicode range (0x0E00-0x0E7F) + if (c >= 0x0E00 && c <= 0x0E7F) return true; + } + return false; + } + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + // Basic cleanup for Thai + string normalized = input.ToLower(_culture).Trim(); + + // Add spaces around "Prime" to match database format better + normalized = normalized.Replace("prime", " prime "); + + // Remove accents (not typically needed for Thai - preserve tone/vowel marks) + // normalized = RemoveAccents(normalized); + + // Remove extra spaces + var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + return string.Join(" ", parts); + } + + public override bool IsPartNameValid(string partName) + { + // Thai requires minimum of 4 characters after removing spaces + return !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 4; + } + + + /// + /// Normalizes Thai characters for comparison + /// + private static string NormalizeThaiCharacters(string input) + { + string result = NormalizeFullWidthCharacters(input); + + // Basic Thai tone mark normalization + result = result.Normalize(System.Text.NormalizationForm.FormC); + + // Common Thai OCR confusions and character variations + result = result.Replace('ซ', 'ศ').Replace('ศ', 'ษ'); // so variations normalization + result = result.Replace('ผ', 'ฝ'); // pho/fo confusion + result = result.Replace('บ', 'ป'); // bo/po confusion + result = result.Replace('ด', 'ต'); // do/to confusion + result = result.Replace('อ', 'โ'); // o/o form variations + + // Remove or normalize common diacritic issues + result = result.Replace("์", ""); // Remove karan (silent marker) for comparison + + // Normalize similar vowel forms + result = result.Replace('ใ', 'ไ'); // ai vowel variations + result = result.Replace('ำ', 'ํ'); // am vowel variations + + return result.ToLowerInvariant(); + } + } +} diff --git a/WFInfo/LanguageProcessing/TurkishLanguageProcessor.cs b/WFInfo/LanguageProcessing/TurkishLanguageProcessor.cs new file mode 100644 index 00000000..5e797522 --- /dev/null +++ b/WFInfo/LanguageProcessing/TurkishLanguageProcessor.cs @@ -0,0 +1,74 @@ +using System; +using System.Text.RegularExpressions; +using WFInfo.Settings; + +namespace WFInfo.LanguageProcessing +{ + /// + /// Turkish language processor for OCR text processing + /// Handles Turkish characters with special diacritics + /// + public class TurkishLanguageProcessor : LanguageProcessor + { + public TurkishLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) + { + } + + public override string Locale => "tr"; + + public override string[] BlueprintRemovals => new[] { "Plan", "Şema" }; + + public override string CharacterWhitelist => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz " + "ÇçĞğİıÖöŞşÜü"; // Turkish-specific characters + + public override int CalculateLevenshteinDistance(string s, string t) + { + return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizeTurkishCharacters, callBaseDefault: true); + } + + public override string NormalizeForPatternMatching(string input) + { + if (string.IsNullOrEmpty(input)) return input; + + // Basic cleanup for Turkish + string normalized = input.ToLower(_culture).Trim(); + + // Add spaces around "Prime" to match database format better + normalized = normalized.Replace("prime", " prime "); + + // Remove accents (not typically needed for Turkish as it has specific diacritics) + normalized = RemoveAccents(normalized); + + // Remove extra spaces + var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + return string.Join(" ", parts); + } + + public override bool IsPartNameValid(string partName) + { + // Turkish requires minimum of 6 characters after removing spaces + return !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 6; + } + + + /// + /// Normalizes Turkish characters to standard equivalents for comparison + /// + private static string NormalizeTurkishCharacters(string input) + { + // Convert Turkish characters to standard equivalents for comparison + return input.ToLowerInvariant() + .Replace('ğ', 'g') + .Replace('Ğ', 'G') + .Replace('ş', 's') + .Replace('Ş', 'S') + .Replace('ç', 'c') + .Replace('Ç', 'C') + .Replace('ö', 'o') + .Replace('Ö', 'O') + .Replace('ü', 'u') + .Replace('Ü', 'U') + .Replace('ı', 'i') + .Replace('İ', 'I'); + } + } +} diff --git a/WFInfo/Main.cs b/WFInfo/Main.cs index 7eb28807..8f16ebd2 100644 --- a/WFInfo/Main.cs +++ b/WFInfo/Main.cs @@ -95,10 +95,13 @@ public static async Task UpdateMarketStatusAsync(string msg) } // Use async UI dispatcher call - await MainWindow.INSTANCE.Dispatcher.InvokeAsync(() => - { - MainWindow.INSTANCE.UpdateMarketStatus(msg); - }); + var wnd = MainWindow.INSTANCE; + var disp = wnd?.Dispatcher; + if (disp != null) + await disp.InvokeAsync(() => + { + wnd.UpdateMarketStatus(msg); + }); } private static IServiceCollection ConfigureServices(IServiceCollection services) @@ -262,7 +265,14 @@ await Task.Run(async () => public static void RunOnUIThread(Action act) { - MainWindow.INSTANCE.Dispatcher.Invoke(act); + var mw = MainWindow.INSTANCE; + if (mw?.Dispatcher != null && !mw.Dispatcher.HasShutdownStarted && !mw.Dispatcher.HasShutdownFinished) + { + if (mw.Dispatcher.CheckAccess()) + act(); + else + mw.Dispatcher.Invoke(act); + } } public static void StartMessage() @@ -298,7 +308,7 @@ public static void AddLog(string argm) /// 0 = normal, 1 = red, 2 = orange, 3 =yellow public static void StatusUpdate(string message, int severity) { - MainWindow.INSTANCE.Dispatcher.Invoke(() => { MainWindow.INSTANCE.ChangeStatus(message, severity); }); + RunOnUIThread(() => { MainWindow.INSTANCE.ChangeStatus(message, severity); }); } public void ActivationKeyPressed(Object key) @@ -540,7 +550,7 @@ private void LoadScreenshot(ScreenshotType type) // Switch to logged in mode for warfrane.market systems public void LoggedIn() { //this is bullshit, but I couldn't call it in login.xaml.cs because it doesn't properly get to the main window - MainWindow.INSTANCE.Dispatcher.Invoke(() => { MainWindow.INSTANCE.LoggedIn(); }); + RunOnUIThread(() => { MainWindow.INSTANCE.LoggedIn(); }); // start the AFK timer latestActive = DateTime.UtcNow.AddMinutes(1); @@ -610,7 +620,7 @@ public static int VersionToInteger(string vers) public static void SignOut() { - MainWindow.INSTANCE.Dispatcher.Invoke(() => { MainWindow.INSTANCE.SignOut(); }); + RunOnUIThread(() => { MainWindow.INSTANCE.SignOut(); }); } } diff --git a/WFInfo/Ocr.cs b/WFInfo/Ocr.cs index c59d8b39..fff6be58 100644 --- a/WFInfo/Ocr.cs +++ b/WFInfo/Ocr.cs @@ -16,6 +16,7 @@ using WFInfo.Services.Screenshot; using WFInfo.Services.WindowInfo; using WFInfo.Settings; +using WFInfo.LanguageProcessing; using Brushes = System.Drawing.Brushes; using Clipboard = System.Windows.Forms.Clipboard; using Color = System.Drawing.Color; @@ -97,13 +98,50 @@ class OCR // UI - Scaling used in Warframe public static double uiScaling; - public static Regex RE = new Regex("[^a-z가-힣]", RegexOptions.IgnoreCase | RegexOptions.Compiled); + // Language-specific regex patterns are now handled by CharacterWhitelist in Tesseract + // No post-processing filtering needed since Tesseract handles character filtering at source // Pixel measurements for reward screen @ 1920 x 1080 with 100% scale https://docs.google.com/drawings/d/1Qgs7FU2w1qzezMK-G1u9gMTsQZnDKYTEU36UPakNRJQ/edit public const int pixleRewardWidth = 968; public const int pixleRewardHeight = 235; public const int pixleRewardYDisplay = 316; public const int pixelRewardLineHeight = 48; + + // CJK language detection helper - Korean, Simplified Chinese, Traditional Chinese share similar OCR needs + private static bool IsCJKLocale() + { + var locale = ApplicationSettings.GlobalReadonlySettings.Locale; + return locale == "ko" || locale == "zh-hans" || locale == "zh-hant" || locale == "ja"; + } + + // CJK-specific adjustments for multi-line text + private static int GetAdjustedLineHeight() + { + // CJK text needs slightly more vertical space for multi-line wrapping + return IsCJKLocale() ? 58 : pixelRewardLineHeight; + } + + /// + /// Safe call helper to execute functions with consistent error handling and logging + /// + /// Return type of the function + /// Function to execute + /// Default value to return on error + /// Name of the operation for logging + /// Name of the item being processed + /// Result of the function or default value on error + private static T SafeCall(Func func, T defaultValue, string operationName, string itemName) + { + try + { + return func(); + } + catch (Exception ex) + { + Main.AddLog($"ERROR: {operationName} failed for '{itemName}': {ex.Message}"); + return defaultValue; + } + } public const int SCALING_LIMIT = 100; public static bool processingActive = false; @@ -140,19 +178,36 @@ public static void Init(ITesseractService tesseractService, ISoundPlayer soundPl { Directory.CreateDirectory(Main.AppPath + @"\Debug"); _tesseractService = tesseractService; - _tesseractService.Init(); _soundPlayer = soundPlayer; _settings = settings; _window = window; - _hdrDetector = hdrDetector; - _gdiScreenshot = gdiScreenshot; _windowsScreenshot = windowsScreenshot; + _hdrDetector = hdrDetector; + + // Initialize the language processor factory before tesseract service + LanguageProcessorFactory.Initialize(settings); + + try + { + _tesseractService.Init(); + } + catch (Exception ex) + { + Main.AddLog($"ERROR: Failed to initialize TesseractService: {ex.Message}"); + _tesseractService = null; + } } internal static void ProcessRewardScreen(Bitmap file = null) { #region initializers + if (_tesseractService == null) + { + Main.AddLog("ERROR: Cannot process reward screen - TesseractService is null"); + return; + } + if (processingActive) { Main.StatusUpdate("Still Processing Reward Screen", 2); @@ -249,11 +304,28 @@ internal static void ProcessRewardScreen(Bitmap file = null) string part = firstChecks[i]; #region found a part string correctName = Main.dataBase.GetPartName(part, out firstProximity[i], false, out _); + + // Filter out results with excessively high Levenshtein distances (indicating no valid match) + // 9999 is the default value when no match was found, and anything above 50% of string length is likely invalid + if (firstProximity[i] == 9999 || firstProximity[i] > GetMaxAllowedLevenshteinDistance(part.Length) || string.IsNullOrEmpty(correctName)) + { + Main.AddLog($"Rejected junk match: '{part}' with distance {firstProximity[i]}"); + continue; // Skip this part entirely + } + string primeSetName = Data.GetSetName(correctName); JObject job = (JObject)Main.dataBase.marketData.GetValue(correctName); JObject primeSet = (JObject)Main.dataBase.marketData.GetValue(primeSetName); + + // Guard against null market data + if (job == null || job["ducats"] == null) + { + Main.AddLog($"MARKET DATA: No market data or ducats found for '{correctName}', skipping"); + continue; + } + string ducats = job["ducats"].ToObject(); - if (int.Parse(ducats, Main.culture) == 0) + if (!int.TryParse(ducats, System.Globalization.NumberStyles.Integer, Main.culture, out int ducatValue) || ducatValue == 0) { hideRewardInfo = true; } @@ -271,7 +343,7 @@ internal static void ProcessRewardScreen(Bitmap file = null) bool mastered = Main.dataBase.IsPartMastered(correctName); string partsOwned = Main.dataBase.PartsOwned(correctName); string partsCount = Main.dataBase.PartsCount(correctName); - int duc = int.Parse(ducats, Main.culture); + int duc = ducatValue; #endregion #region highlighting @@ -535,7 +607,7 @@ private static bool CheckIfError() /// public static WFtheme GetThemeWeighted(out double closestThresh, Bitmap image = null) { - int lineHeight = (int)(pixelRewardLineHeight / 2 * _window.ScreenScaling); + int lineHeight = (int)(GetAdjustedLineHeight() / 2 * _window.ScreenScaling); // int width = image == null ? window.Width * (int)_window.DpiScaling : image.Width; // int height = image == null ? window.Height * (int)_window.DpiScaling : image.Height; int mostWidth = (int)(pixleRewardWidth * _window.ScreenScaling); @@ -637,12 +709,21 @@ private static WFtheme GetClosestTheme(Color clr, out int threshold) /// /// Scanned part name /// If part name is close enough to valid to actually process - internal static bool PartNameValid (string partName) + internal static bool PartNameValid(string partName) { - if ((partName.Length < 13 && _settings.Locale == "en") || (partName.Replace(" ", "").Length < 6 && _settings.Locale == "ko")) // if part name is smaller than "Bo prime handle" skip current part - //TODO: Add a min character for other locale here. - return false; - return true; + var processor = LanguageProcessorFactory.GetCurrentProcessor(); + return processor?.IsPartNameValid(partName) ?? false; + } + + /// + /// Gets the maximum allowed Levenshtein distance threshold for part name matching + /// + /// Length of the part name + /// Maximum allowed Levenshtein distance + private static int GetMaxAllowedLevenshteinDistance(int partNameLength) + { + // Use 50% of string length with a minimum floor of 3 for consistency + return Math.Max((int)Math.Ceiling(partNameLength * 0.5), 3); } /// @@ -681,12 +762,26 @@ internal static void ProcessSnapIt(Bitmap snapItImage, Bitmap fullShot, Point sn var part = foundParts[i]; if (!PartNameValid(part.Name)) { - foundParts.RemoveAt(i--); //remove invalid part from list to not clog VerifyCount. Decrement to not skip any entries + Main.AddLog($"SnapIt: Rejected invalid part name: \"{part.Name}\" (length after trim: {part.Name?.Replace(" ", "").Length ?? 0})"); + foundParts.RemoveAt(i); //remove invalid part from list to not clog VerifyCount + i--; // Adjust index since we removed an item resultCount--; continue; } - Debug.WriteLine($"Part {foundParts.IndexOf(part)} out of {foundParts.Count}"); + Debug.WriteLine($"Part {i} out of {foundParts.Count}"); string name = Main.dataBase.GetPartName(part.Name, out int levenDist, false, out bool multipleLowest); + + // Filter out results with excessively high Levenshtein distances (indicating no valid match) + // 9999 is the default value when no match was found, and anything above 50% of string length is likely invalid + // Also check for null names (can happen with non-English languages when no match was found) + if (levenDist == 9999 || levenDist > GetMaxAllowedLevenshteinDistance(part.Name.Length) || string.IsNullOrEmpty(name)) + { + foundParts.RemoveAt(i); // remove invalid part from list + i--; // Adjust index since we removed an item + resultCount--; + continue; + } + string primeSetName = Data.GetSetName(name); if (levenDist > Math.Min(part.Name.Length, name.Length) / 3 || multipleLowest) { @@ -696,8 +791,19 @@ internal static void ProcessSnapIt(Bitmap snapItImage, Bitmap fullShot, Point sn bool doWarn = part.Warning; part.Name = name; foundParts[i] = part; - JObject job = Main.dataBase.marketData.GetValue(name).ToObject(); - JObject primeSet = (JObject)Main.dataBase.marketData.GetValue(primeSetName); + + // Safely get market data with null checking + JObject job = Main.dataBase.marketData.GetValue(name) as JObject; + if (job == null) + { + Main.AddLog($"MARKET DATA: No market data found for '{name}', skipping item"); + foundParts.RemoveAt(i); // remove item with no market data + i--; // Adjust index since we removed an item + resultCount--; + continue; + } + + JObject primeSet = Main.dataBase.marketData.GetValue(primeSetName) as JObject; string plat = job["plat"].ToObject(); string primeSetPlat = null; if (primeSet != null) @@ -706,9 +812,11 @@ internal static void ProcessSnapIt(Bitmap snapItImage, Bitmap fullShot, Point sn } string ducats = job["ducats"].ToObject(); string volume = job["volume"].ToObject(); - bool vaulted = Main.dataBase.IsPartVaulted(name); - bool mastered = Main.dataBase.IsPartMastered(name); - string partsOwned = Main.dataBase.PartsOwned(name); + + bool vaulted = SafeCall(() => Main.dataBase.IsPartVaulted(name), false, "IsPartVaulted", name); + bool mastered = SafeCall(() => Main.dataBase.IsPartMastered(name), false, "IsPartMastered", name); + string partsOwned = SafeCall(() => Main.dataBase.PartsOwned(name), "0", "PartsOwned", name); + string partsDetected = ""+part.Count; if (_settings.SnapitExport) @@ -889,35 +997,44 @@ private static List> DivideSnapZones (Bitmap filteredIm private static List> GetTextWithBoundsFromImage(TesseractEngine engine, Bitmap image, int rectXOffset, int rectYOffset) { - List> data = new List>(); - - - using (var page = engine.Process(image, PageSegMode.SparseText)) + // Use single PSM mode for deterministic results + // SparseText is best for SnapIt: finds text anywhere in the image regardless of layout + var results = new List>(); + + try { - using (var iterator = page.GetIterator()) + using (var page = engine.Process(image, PageSegMode.SparseText)) { - - iterator.Begin(); - do + using (var iterator = page.GetIterator()) { - string currentWord = iterator.GetText(PageIteratorLevel.TextLine); - iterator.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect tempbounds); - Rectangle bounds = new Rectangle(tempbounds.X1 + rectXOffset, tempbounds.Y1 + rectYOffset, tempbounds.Width, tempbounds.Height); - if (currentWord != null) + iterator.Begin(); + do { - currentWord = RE.Replace(currentWord, "").Trim(); - if (currentWord.Length > 0) - { //word is valid start comparing to others - data.Add(Tuple.Create(currentWord, bounds)); + string currentWord = iterator.GetText(PageIteratorLevel.TextLine); + iterator.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect tempbounds); + Rectangle bounds = new Rectangle(tempbounds.X1 + rectXOffset, tempbounds.Y1 + rectYOffset, tempbounds.Width, tempbounds.Height); + if (currentWord != null) + { + currentWord = currentWord.Trim(); + if (currentWord.Length > 0) + { + results.Add(Tuple.Create(currentWord, bounds)); + } } } + while (iterator.Next(PageIteratorLevel.TextLine)); } - while (iterator.Next(PageIteratorLevel.TextLine)); } } - return data; + catch (Exception ex) + { + // Log OCR extraction failure for debugging + Main.AddLog($"OCR extraction failed in GetTextWithBoundsFromImage: {ex.Message}\n{ex.ToString()}"); + } + return results; } + /// /// Filters out any group of words and addes them all into a single InventoryItem, containing the found words as well as the bounds within they reside. /// @@ -942,13 +1059,46 @@ private static List FindAllParts(Bitmap filteredImage, Bitmap unf if ( _settings.SnapMultiThreaded) { zones = DivideSnapZones(filteredImage, filteredImageClean, rowHits, colHits); - snapThreads = 4; + // Fallback to single-threaded for large layouts to avoid threading issues + if (zones.Count > 12) // Too many zones means fragmentation is occurring + { + // Dispose existing Bitmaps before replacing zones + foreach (var zone in zones) + { + try + { + zone.Item1?.Dispose(); + } + catch + { + // Ignore disposal errors + } + } + + // Fallback to single-threaded for large layouts to avoid threading issues + zones = new List>(); + zones.Add( Tuple.Create(filteredImageClean, new Rectangle(0, 0, filteredImageClean.Width, filteredImageClean.Height) ) ); + snapThreads = 1; + // Keep the zones but process them single-threaded + } + else if (zones.Count > 8) // Large but reasonable number of zones + { + // Large but reasonable number of zones + snapThreads = 1; + } + else + { + snapThreads = 4; + } } else { zones = new List>(); zones.Add( Tuple.Create(filteredImageClean, new Rectangle(0, 0, filteredImageClean.Width, filteredImageClean.Height) ) ); snapThreads = 1; } + + // Initialize results list early for single zone mode + List results = new List(); Task < List>>[] snapTasks = new Task>>[snapThreads]; for (int i = 0; i < snapThreads; i++) { @@ -956,53 +1106,118 @@ private static List FindAllParts(Bitmap filteredImage, Bitmap unf snapTasks[i] = Task.Factory.StartNew(() => { List> taskResults = new List>(); + int zonesProcessed = 0; for (int j = tempI; j < zones.Count; j += snapThreads) { //process images List> currentResult = GetTextWithBoundsFromImage(_tesseractService.Engines[tempI], zones[j].Item1, zones[j].Item2.X, zones[j].Item2.Y); taskResults.AddRange(currentResult); + zonesProcessed++; } + // Thread processing complete return taskResults; }); } Task.WaitAll(snapTasks); + // Dispose all zone bitmaps after processing is complete (except filteredImageClean which is disposed later) + foreach (var zone in zones) + { + try + { + // Skip disposing filteredImageClean as it's needed by GetItemCounts() and disposed later + if (!ReferenceEquals(zone.Item1, filteredImageClean)) + { + zone.Item1?.Dispose(); + } + } + catch + { + // Ignore disposal errors + } + } + + // Get processor once outside loops for performance + var processor = LanguageProcessorFactory.GetCurrentProcessor(); + for (int threadNum = 0; threadNum < snapThreads; threadNum++) { foreach (Tuple wordResult in snapTasks[threadNum].Result) { - string currentWord = wordResult.Item1; + string currentLine = wordResult.Item1; Rectangle bounds = wordResult.Item2; + + // Split line into individual words for proper filtering + var words = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); + var filteredWords = new List(); + + // Filter individual words as intended + foreach (var word in words) + { + if (processor == null || !processor.ShouldFilterWord(word)) + { + filteredWords.Add(word); + } + else if (word.Length <= 3) + { + numberTooFewCharacters++; + } + } + + // If all words were filtered, skip this line + if (filteredWords.Count == 0) + { + Main.AddLog($"SnapIt: All words filtered from line: \"{currentLine}\""); + continue; + } + + // Reconstruct the filtered line + string currentWord = string.Join(" ", filteredWords); //word is valid start comparing to others - int VerticalPad = bounds.Height/2; - int HorizontalPad = (int)(bounds.Height * _settings.SnapItHorizontalNameMargin); + // CJK text wraps across multiple lines more often, so increase vertical padding + // to ensure multi-line item names get grouped into a single item + int VerticalPad = IsCJKLocale() + ? bounds.Height * 3 / 4 // Moderate padding for CJK multi-line grouping (not full height to avoid cross-item merging) + : bounds.Height / 2; + // Reduce horizontal padding for CJK to prevent cross-item horizontal merging + // CJK item tiles in the SnapIt grid are close together, so large horizontal padding + // causes padded bounds to overlap with adjacent items + double hMargin = IsCJKLocale() + ? Math.Min(_settings.SnapItHorizontalNameMargin, 0.3) // Cap at 0.3 for CJK + : _settings.SnapItHorizontalNameMargin; + int HorizontalPad = (int)(bounds.Height * hMargin); + + var paddedBounds = new Rectangle(bounds.X - HorizontalPad, bounds.Y - VerticalPad, bounds.Width + HorizontalPad * 2, bounds.Height + VerticalPad * 2); //var paddedBounds = new Rectangle(bounds.X - bounds.Height / 3, bounds.Y - bounds.Height / 3, bounds.Width + bounds.Height, bounds.Height + bounds.Height / 2); using (Graphics g = Graphics.FromImage(filteredImage)) { - if (paddedBounds.Height > 50 * _window.ScreenScaling || paddedBounds.Width > 84 * _window.ScreenScaling) + // CJK characters are inherently larger than Latin, so use higher thresholds + // Also CJK 3-char words like 리시버/설계도/槍機/藍圖 are valid item name fragments + bool isCJK = IsCJKLocale(); + int sizeThresholdH = isCJK ? (int)(80 * _window.ScreenScaling) : (int)(50 * _window.ScreenScaling); + int sizeThresholdW = isCJK ? (int)(120 * _window.ScreenScaling) : (int)(84 * _window.ScreenScaling); + int minCharLength = isCJK ? 2 : 3; // CJK packs more info per character + + if (paddedBounds.Height > sizeThresholdH || paddedBounds.Width > sizeThresholdW) { //Determine whether or not the box is too large, false positives in OCR can scan items (such as neuroptics, chassis or systems) as a character(s). - if (currentWord.Length > 3) - { // more than 3 characters in a box too large is likely going to be good, pass it but mark as potentially bad + if (currentWord.Length > minCharLength) + { // enough characters in a box too large is likely going to be good, pass it but mark as potentially bad g.DrawRectangle(orange, paddedBounds); numberTooLargeButEnoughCharacters++; } else { + Main.AddLog($"SnapIt: Rejected oversized box with short text: \"{currentWord}\" (bounds: {paddedBounds.Width}x{paddedBounds.Height})"); g.FillRectangle(red, paddedBounds); numberTooLarge++; continue; } } - else if (currentWord.Length < 2 && _settings.Locale == "en") - { - g.FillRectangle(green, paddedBounds); - numberTooFewCharacters++; - continue; - } else { + // Words already filtered at individual level above g.DrawRectangle(pinkP, paddedBounds); } g.DrawRectangle(greenp, bounds); @@ -1010,10 +1225,25 @@ private static List FindAllParts(Bitmap filteredImage, Bitmap unf } int i = foundItems.Count - 1; + + // Max combined width to prevent merging text from different items in the grid + // Each item tile is roughly 130-140px wide at 1080p; cap at 160px to allow + // multi-line wrapping within one item but prevent cross-item cascading merges + int maxGroupWidth = (int)(160 * _window.ScreenScaling); for (; i >= 0; i--) + { if (foundItems[i].Item2.IntersectsWith(paddedBounds)) - break; + { + // Check if merging would create an unreasonably wide group + int combinedLeft = Math.Min(foundItems[i].Item2.Left, paddedBounds.Left); + int combinedRight = Math.Max(foundItems[i].Item2.Right, paddedBounds.Right); + int combinedWidth = combinedRight - combinedLeft; + if (combinedWidth <= maxGroupWidth) + break; // OK to merge + // else: skip this group, too wide — would merge across items + } + } if (i == -1) { @@ -1037,8 +1267,8 @@ private static List FindAllParts(Bitmap filteredImage, Bitmap unf } } - List results = new List(); - + + // Process item groups foreach( Tuple, Rectangle> itemGroup in foundItems) { //Sort order for component words to appear in. If large height difference, sort vertically. If small height difference, sort horizontally @@ -1059,6 +1289,8 @@ private static List FindAllParts(Bitmap filteredImage, Bitmap unf results.Add(new InventoryItem(name, itemGroup.Item2)); } + // Final results processed + if ( _settings.DoSnapItCount) { GetItemCounts(filteredImage, filteredImageClean, unfilteredImage, results, font); @@ -1174,10 +1406,11 @@ private static void GetItemCounts(Bitmap filteredImage, Bitmap filteredImageClea //set OCR to numbers only - _tesseractService.FirstEngine.SetVariable("tessedit_char_whitelist", "0123456789"); - + try + { + _tesseractService.SetNumbersOnlyMode(); - double widthMultiplier = (_settings.DoCustomNumberBoxWidth ? _settings.SnapItNumberBoxWidth : 0.4); + double widthMultiplier = (_settings.DoCustomNumberBoxWidth ? _settings.SnapItNumberBoxWidth : 0.4); //Process grid system for (int i = 0; i < Rows.Count; i++) { @@ -1525,7 +1758,11 @@ private static void GetItemCounts(Bitmap filteredImage, Bitmap filteredImageClea } //return OCR to any symbols - _tesseractService.FirstEngine.SetVariable("tessedit_char_whitelist", ""); + } + finally + { + _tesseractService.ResetToDefaultMode(); + } } darkCyan.Dispose(); red.Dispose(); @@ -1813,7 +2050,7 @@ private static List FindOwnedItems(Bitmap ProfileImage, string ti //do OCR - _tesseractService.FirstEngine.SetVariable("tessedit_char_whitelist", " ABCDEFGHIJKLMNOPQRSTUVWXYZ&"); + // Using default language-specific whitelist using (var page = _tesseractService.FirstEngine.Process(cloneBitmap, PageSegMode.SingleLine)) { using (var iterator = page.GetIterator()) @@ -1828,7 +2065,6 @@ private static List FindOwnedItems(Bitmap ProfileImage, string ti } } - _tesseractService.FirstEngine.SetVariable("tessedit_char_whitelist", ""); } } if (nextYCounter >= 0) @@ -2013,16 +2249,16 @@ private static List ExtractPartBoxAutomatically(out double scaling, out long start = watch.ElapsedMilliseconds; long beginning = start; - int lineHeight = (int)(pixelRewardLineHeight / 2 * _window.ScreenScaling); + int lineHeight = (int)(GetAdjustedLineHeight() * _window.ScreenScaling); Color clr; int width = _window.Window.Width; int height = _window.Window.Height; int mostWidth = (int)(pixleRewardWidth * _window.ScreenScaling); int mostLeft = (width / 2) - (mostWidth / 2 ); - // Most Top = pixleRewardYDisplay - pixleRewardHeight + pixelRewardLineHeight + // Most Top = pixleRewardYDisplay - pixleRewardHeight + GetAdjustedLineHeight() // (316 - 235 + 44) * 1.1 = 137 - int mostTop = height / 2 - (int)((pixleRewardYDisplay - pixleRewardHeight + pixelRewardLineHeight) * _window.ScreenScaling); + int mostTop = height / 2 - (int)((pixleRewardYDisplay - pixleRewardHeight + GetAdjustedLineHeight()) * _window.ScreenScaling); int mostBot = height / 2 - (int)((pixleRewardYDisplay - pixleRewardHeight) * _window.ScreenScaling * 0.5); //Bitmap postFilter = new Bitmap(mostWidth, mostBot - mostTop); var rectangle = new Rectangle((int)(mostLeft), (int)(mostTop), mostWidth, mostBot - mostTop); @@ -2173,7 +2409,7 @@ private static List ExtractPartBoxAutomatically(out double scaling, out int cropWidth = (int)(pixleRewardWidth * _window.ScreenScaling * highScaling); int cropLeft = (preFilter.Width / 2) - (cropWidth / 2); - int cropTop = height / 2 - (int)((pixleRewardYDisplay - pixleRewardHeight + pixelRewardLineHeight) * _window.ScreenScaling * highScaling); + int cropTop = height / 2 - (int)((pixleRewardYDisplay - pixleRewardHeight + GetAdjustedLineHeight()) * _window.ScreenScaling * highScaling); int cropBot = height / 2 - (int)((pixleRewardYDisplay - pixleRewardHeight) * _window.ScreenScaling * lowScaling); int cropHei = cropBot - cropTop; cropTop -= mostTop; @@ -2296,7 +2532,7 @@ private static List FilterAndSeparatePartsFromPartBox(Bitmap partBox, WF //private static List FilterAndSeparateParts(Bitmap image, WFtheme active) //{ // int width = (int)(pixleRewardWidth * _window.ScreenScaling * uiScaling); - // int lineHeight = (int)(pixelRewardLineHeight * _window.ScreenScaling * uiScaling); + // int lineHeight = (int)(GetAdjustedLineHeight() * _window.ScreenScaling * uiScaling); // int left = (image.Width / 2) - (width / 2); // int top = (image.Height / 2) - (int)(pixleRewardYDisplay * _window.ScreenScaling * uiScaling) + (int)(pixleRewardHeight * _window.ScreenScaling * uiScaling) - lineHeight; @@ -2317,9 +2553,119 @@ private static List FilterAndSeparatePartsFromPartBox(Bitmap partBox, WF public static string GetTextFromImage(Bitmap image, TesseractEngine engine) { string ret = ""; - using (Page page = engine.Process(image)) - ret = page.GetText().Trim(); - return RE.Replace(ret, "").Trim(); + + // Use intelligent PSM selection for better Korean text recognition + // Try modes in order of likelihood, exit early if we get a good result + // For Korean: prioritize single block modes for wrapped multi-line item names + PageSegMode[] preferredModes = { + PageSegMode.SingleBlock, // Best for single items with multi-line wrapping + PageSegMode.SingleColumn // Good for stacked lines in reward slots + // Removed SparseText and Auto to improve performance + }; + + Dictionary modeResults = new Dictionary(); + Dictionary modeScores = new Dictionary(); + + foreach (var mode in preferredModes) + { + try + { + using (Page page = engine.Process(image, mode)) + { + string text = page.GetText().Trim(); + modeResults[mode] = text; + + // Score the result + double score = ScoreTextResult(text, mode); + modeScores[mode] = score; + + // Early exit if we got a very good result (has CJK chars and reasonable length) + if (score > 50 && text.Length > 6 && text.Any(c => + (c >= 0xAC00 && c <= 0xD7AF) || // Korean Hangul + (c >= 0x4E00 && c <= 0x9FFF) || // CJK Unified Ideographs + (c >= 0x3400 && c <= 0x4DBF))) // CJK Extension A + { + ret = text; + break; // Exit early for performance + } + } + } + catch (Exception e) + { + Main.AddLog($"OCR extraction failed in GetTextFromImage: {e.Message}\n{e.ToString()}"); + modeResults[mode] = ""; + modeScores[mode] = 0; + } + } + + // If we didn't exit early, select the best result + if (string.IsNullOrEmpty(ret)) + { + var bestMode = modeScores.OrderByDescending(kvp => kvp.Value).First().Key; + ret = modeResults[bestMode] ?? ""; + } + + // Tesseract now handles character filtering via CharacterWhitelist + return ret.Trim(); + } + /// + /// Scores OCR text results for quality assessment + /// + private static double ScoreTextResult(string text, PageSegMode mode) + { + if (string.IsNullOrEmpty(text)) + return 0; + + double score = 0; + + // Base score for text length + score += Math.Min(text.Length, 100); + + // Korean character detection bonus + int koreanChars = text.Count(c => c >= 0xAC00 && c <= 0xD7AF); + // CJK character detection bonus (Chinese Simplified/Traditional) + int cjkChars = text.Count(c => (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x3400 && c <= 0x4DBF)); + int nonLatinChars = koreanChars + cjkChars; + if (nonLatinChars > 0) + { + score += 20; // Bonus for CJK text + score += Math.Min(nonLatinChars * 2, 30); // Additional bonus per CJK character + } + + // Line count analysis + string[] lines = text.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries); + score += Math.Min(lines.Length * 5, 25); + + // Mode-specific scoring + if (mode == PageSegMode.SingleBlock) + { + // SingleBlock should work well for single items (up to 3 lines with word wrapping) + if (lines.Length >= 1 && lines.Length <= 3) + score += 20; // Higher bonus for optimal single item blocks + if (nonLatinChars > 0 && lines.Length >= 2) + score += 15; // Extra bonus for multi-line CJK text (wrapped item names) + } + else if (mode == PageSegMode.SingleColumn) + { + // SingleColumn should handle stacked lines well in reward slots + if (lines.Length >= 1 && lines.Length <= 4) + score += 15; // Good for vertically stacked reward text + if (nonLatinChars > 0) + score += 10; // Bonus for CJK text in column layout + } + else if (mode == PageSegMode.SparseText) + { + // SparseText should find multiple distinct text regions + if (lines.Length >= 2) + score += 10; + } + + // Penalty for too much whitespace (indicates poor segmentation) + double whitespaceRatio = (double)text.Count(char.IsWhiteSpace) / text.Length; + if (whitespaceRatio > 0.3) + score -= 10; + + return Math.Max(score, 0); } internal static List SeparatePlayers(Bitmap image, TesseractEngine engine) @@ -2364,7 +2710,8 @@ internal static List SeparatePlayers(Bitmap image, TesseractEngine engin string word = iter.GetText(PageIteratorLevel.Word); if (word != null) { - word = RE.Replace(word, "").Trim(); + // Tesseract now handles character filtering via CharacterWhitelist + word = word.Trim(); if (word.Length > 0) { int topOrBot = outRect.Y1 > (outRect.Height * 3 / 4) ? 0 : 1; @@ -2534,12 +2881,135 @@ public static async Task updateEngineAsync() { await ReloadSemaphore.WaitAsync().ConfigureAwait(false); try { - await Task.Run(() => _tesseractService.ReloadEngines()).ConfigureAwait(false); + if (_tesseractService != null) + await Task.Run(() => _tesseractService.ReloadEngines()).ConfigureAwait(false); + else + Main.AddLog("ERROR: Cannot reload engines - TesseractService is null"); } finally { ReloadSemaphore.Release(); } } + + #region Test Support Methods + + /// + /// Test-only entry point: runs the reward screen OCR pipeline on a screenshot + /// and returns the list of matched part names (English) without any UI side-effects. + /// Requires OCR.Init and Main.dataBase to be initialized. + /// + internal static List ProcessRewardScreenForTest(Bitmap screenshot, IWindowInfoService windowService) + { + var results = new List(); + windowService.UseImage(screenshot); + + List parts; + try + { + parts = ExtractPartBoxAutomatically(out uiScaling, out _, screenshot); + } + catch (Exception e) + { + Main.AddLog("Test ProcessReward: ExtractPartBoxAutomatically failed: " + e.Message); + return results; + } + + int engineCount = Math.Min(parts.Count, _tesseractService.Engines.Length); + string[] checks = new string[parts.Count]; + Task[] tasks = new Task[engineCount]; + for (int i = 0; i < engineCount; i++) + { + int tempI = i; + tasks[i] = Task.Factory.StartNew(() => { checks[tempI] = GetTextFromImage(parts[tempI], _tesseractService.Engines[tempI]); }); + } + Task.WaitAll(tasks); + + // Process remaining parts sequentially if more parts than engines + for (int i = engineCount; i < parts.Count; i++) + { + checks[i] = GetTextFromImage(parts[i], _tesseractService.FirstEngine); + } + + foreach (var p in parts) p.Dispose(); + + var validChecks = checks.Where(s => !string.IsNullOrEmpty(s) && s.Replace(" ", "").Length > 6).ToArray(); + + foreach (var part in validChecks) + { + string correctName = Main.dataBase.GetPartName(part, out int dist, false, out _); + if (dist != 9999 && dist <= GetMaxAllowedLevenshteinDistance(part.Length) && !string.IsNullOrEmpty(correctName)) + { + results.Add(correctName); + } + } + + return results; + } + + /// + /// Test-only entry point: runs the SnapIt OCR pipeline on a screenshot + /// and returns the list of matched part names (English) without any UI side-effects. + /// Requires OCR.Init and Main.dataBase to be initialized. + /// + internal static List ProcessSnapItForTest(Bitmap screenshot, IWindowInfoService windowService) + { + var results = new List(); + windowService.UseImage(screenshot); + + WFtheme theme = GetThemeWeighted(out _, screenshot); + if (theme == WFtheme.UNKNOWN) + { + Main.AddLog("Test SnapIt: Theme detection failed"); + return results; + } + + Bitmap filtered = ScaleUpAndFilter(screenshot, theme, out int[] rowHits, out int[] colHits); + List foundParts = FindAllParts(filtered, screenshot, rowHits, colHits); + filtered.Dispose(); + + foreach (var part in foundParts) + { + if (!PartNameValid(part.Name)) + continue; + + string name = Main.dataBase.GetPartName(part.Name, out int levenDist, false, out bool multipleLowest); + if (levenDist == 9999 || levenDist > GetMaxAllowedLevenshteinDistance(part.Name.Length) || string.IsNullOrEmpty(name)) + continue; + + results.Add(name); + } + + return results; + } + + /// + /// Test-only: initializes OCR for headless test mode with only the required services. + /// + internal static void InitForTest(ITesseractService tesseractService, IReadOnlyApplicationSettings settings, + IWindowInfoService window, IHDRDetectorService hdrDetector) + { + Directory.CreateDirectory(Main.AppPath + @"\Debug"); + _tesseractService = tesseractService; + _soundPlayer = null; + _settings = settings; + _window = window; + _gdiScreenshot = null; + _windowsScreenshot = null; + _hdrDetector = hdrDetector; + + LanguageProcessorFactory.Initialize(settings); + try + { + _tesseractService.Init(); + } + catch (Exception ex) + { + Main.AddLog($"ERROR: Failed to initialize TesseractService in test mode: {ex.Message}"); + _tesseractService = null; + } + } + + #endregion } public struct InventoryItem diff --git a/WFInfo/Properties/AssemblyInfo.cs b/WFInfo/Properties/AssemblyInfo.cs index 48a0e74d..8d6c8cd5 100644 --- a/WFInfo/Properties/AssemblyInfo.cs +++ b/WFInfo/Properties/AssemblyInfo.cs @@ -51,5 +51,5 @@ // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("9.7.1.0")] -[assembly: AssemblyFileVersion("9.7.1.0")] +[assembly: AssemblyVersion("9.8.0.0")] +[assembly: AssemblyFileVersion("9.8.0.0")] diff --git a/WFInfo/Services/TesseractService.cs b/WFInfo/Services/TesseractService.cs index 676d52e6..cd2921e8 100644 --- a/WFInfo/Services/TesseractService.cs +++ b/WFInfo/Services/TesseractService.cs @@ -1,10 +1,12 @@ using System; +using System.Collections.Generic; using System.IO; using System.Linq; using System.Net; using Newtonsoft.Json.Linq; using Tesseract; using WFInfo.Settings; +using WFInfo.LanguageProcessing; namespace WFInfo { @@ -27,11 +29,21 @@ public interface ITesseractService void Init(); void ReloadEngines(); + + /// + /// Sets the FirstEngine to numbers-only mode for item counting + /// + void SetNumbersOnlyMode(); + + /// + /// Resets the FirstEngine to its default language-specific whitelist + /// + void ResetToDefaultMode(); } /// - /// Holds all the TesseractEngine instances and is responsible for loadind/reloading them - /// They are all configured in the same way + /// Holds all TesseractEngine instances and is responsible for loadind/reloading them + /// They are all configured with language-specific character whitelists to reduce noise /// public class TesseractService : ITesseractService { @@ -44,7 +56,7 @@ public class TesseractService : ITesseractService /// public TesseractEngine SecondEngine { get; private set; } /// - /// Engines for parallel processing the reward screen and snapit + /// Engines for parallel processing of reward screen and snapit /// public TesseractEngine[] Engines { get; } = new TesseractEngine[4]; @@ -54,6 +66,12 @@ public class TesseractService : ITesseractService private static readonly string FallbackDataPath = Environment.GetFolderPath(Environment.SpecialFolder.CommonApplicationData) + @"\WFInfo" + @"\tessdata"; private string DataPath; + // Fallback whitelist for unknown locales + private const string DefaultWhitelist = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + // Numbers-only whitelist for item counting + private const string NumbersOnlyWhitelist = "0123456789"; + public TesseractService() { Directory.CreateDirectory(NormalDataPath); @@ -99,10 +117,67 @@ public TesseractService() SecondEngine = CreateEngine(); } - private TesseractEngine CreateEngine() => new TesseractEngine(DataPath, Locale) + private TesseractEngine CreateEngine() { - DefaultPageSegMode = PageSegMode.SingleBlock - }; + //Main.AddLog($"Creating Tesseract engine for locale: '{Locale}'"); + var engine = new TesseractEngine(DataPath, Locale); + + // Apply universal OCR improvements for all languages + + // This causes crash + //engine.SetVariable("tessedit_reject_mode", "1"); // Reject questionable characters + //engine.SetVariable("textord_heavy_nr", "1"); // Enable heavy noise reduction + + engine.SetVariable("tessedit_zero_rejection", "false"); // Don't force recognition of uncertain characters + engine.SetVariable("tessedit_write_rep_codes", "false"); // Don't write rejection codes + engine.SetVariable("tessedit_write_unlv", "false"); // Don't write UNLV format + engine.SetVariable("tessedit_fix_fuzzy_spaces", "true"); // Fix spacing issues + engine.SetVariable("tessedit_prefer_joined_broken", "false"); // Don't join broken characters + engine.SetVariable("tessedit_font_id", "0"); // Use default font (Tesseract 5+) + + // Dictionary and spacing improvements for UI text + engine.SetVariable("preserve_interword_spaces", "1"); // Preserve spacing for stable output + + // Language model penalties that work across all languages + engine.SetVariable("language_model_penalty_case_ok", "0.1"); // Small penalty for case mismatches + engine.SetVariable("language_model_penalty_case_bad", "0.4"); // Higher penalty for bad case + + // Thresholding parameters for better binarization (Tesseract 5+) + engine.SetVariable("thresholding_method", "0"); // Use default thresholding + engine.SetVariable("thresholding_window_size", "5"); // Smaller window for better noise reduction + + // Apply language-specific optimizations + // CJK languages (Korean, Simplified Chinese, Traditional Chinese) share similar OCR challenges + if (Locale == "ko" || Locale == "zh-hans" || Locale == "zh-hant") + { + // CJK-specific OCR improvements for better character recognition + engine.SetVariable("textord_noise_normratio", "2.0"); // More aggressive noise reduction for CJK + engine.SetVariable("chop_enable", "0"); // Disable character chopping for CJK characters + engine.SetVariable("use_new_state_cost", "1"); // Use new state cost for better CJK recognition + engine.SetVariable("load_system_dawg", "true"); // Enable system dictionary for better text segmentation + engine.SetVariable("load_freq_dawg", "true"); // Enable frequency dictionary for better text segmentation + engine.SetVariable("language_model_penalty_non_dict_word", "0"); // Don't penalize non-dictionary words (item names aren't dictionary words) + engine.SetVariable("user_defined_dpi", "300"); // Improve recognition for scaled/filtered UI text + engine.SetVariable("segment_nonalphabetic_script", "1"); // Better segmentation for non-alphabetic scripts + } + else if (Locale == "en") + { + // Aggressive settings for English to reduce noise + engine.SetVariable("language_model_penalty_non_dict_word", "0.3"); // Penalize non-dictionary words heavily + engine.SetVariable("load_system_dawg", "false"); // Disable system dictionary for better UI text recognition + engine.SetVariable("load_freq_dawg", "false"); // Disable frequency dictionary for better UI text recognition + engine.SetVariable("textord_force_make_prop_words", "true"); // Help with compound words + + } + + // Apply language-specific character whitelist from language processor + var processor = LanguageProcessorFactory.GetProcessor(Locale); + var whitelist = processor?.CharacterWhitelist ?? DefaultWhitelist; + engine.SetVariable("tessedit_char_whitelist", whitelist); + //Main.AddLog($"Tesseract whitelist for '{Locale}': '{whitelist}'"); + + return engine; + } public void Init() { @@ -127,13 +202,41 @@ public void ReloadEngines() SecondEngine?.Dispose(); SecondEngine = CreateEngine(); } + + public void SetNumbersOnlyMode() + { + FirstEngine?.SetVariable("tessedit_char_whitelist", NumbersOnlyWhitelist); + } + + public void ResetToDefaultMode() + { + if (FirstEngine != null) + { + var processor = LanguageProcessorFactory.GetProcessor(Locale); + var whitelist = processor?.CharacterWhitelist ?? DefaultWhitelist; + FirstEngine.SetVariable("tessedit_char_whitelist", whitelist); + } + } private void getLocaleTessdata() { string traineddata_hotlink_prefix = "https://raw.githubusercontent.com/WFCD/WFinfo/libs/tessdata/"; JObject traineddata_checksums = new JObject { {"en", "7af2ad02d11702c7092a5f8dd044d52f"}, - {"ko", "c776744205668b7e76b190cc648765da"} + {"ko", "c776744205668b7e76b190cc648765da"}, + {"fr", "ac0a3da6bf50ed0dab61b46415e82c17"}, + {"uk", "fe1312cbfb602fc179796dbf54ee65fe"}, + {"it", "401cd425084217b224f99c3f55c78518"}, + {"de", "d37aac5fce1c7d8f279a42f076c935d8"}, + {"es", "130215a6355e9ea651f483279271d354"}, + {"pt", "9627fa0ccecdc9dfdb9ac232bbbd744f"}, + {"pl", "33bb3c504011b839cf6e2b689ea68578"}, + //{"tr", "df810a344d6725b2ee3e76682de5a86b"}, - cannot be supported until WFM supports it + {"ru", "2e2022eddce032b754300a8188b41419"}, + //{"ja", "synthetic_md5_japanese"}, - cannot be supported until WFM supports it + {"zh-hans", "921bdf9c27a17ce5c7c77c10345ad8fb"}, + {"zh-hant", "5865dded9ef6d035c165fb14317f1402"}, + //{"th", "synthetic_md5_thai"} - cannot be supported until WFM supports it }; // get trainned data @@ -143,18 +246,33 @@ private void getLocaleTessdata() WebClient webClient = CustomEntrypoint.CreateNewWebClient(); - if (!File.Exists(app_data_traineddata_path) || CustomEntrypoint.GetMD5hash(app_data_traineddata_path) != traineddata_checksums.GetValue(Locale).ToObject()) + // Check if locale is supported before accessing checksums + if (traineddata_checksums.TryGetValue(Locale, out JToken checksumToken)) { - try + string expectedChecksum = checksumToken.ToObject(); + + if (!File.Exists(app_data_traineddata_path) || CustomEntrypoint.GetMD5hash(app_data_traineddata_path) != expectedChecksum) { - webClient.DownloadFile(traineddata_hotlink, app_data_traineddata_path); - // We download to normal data path. If current data path differs, copy it to there too - if (curr_data_traineddata_path != app_data_traineddata_path) + try + { + webClient.DownloadFile(traineddata_hotlink, app_data_traineddata_path); + // We download to normal data path. If current data path differs, copy it to there too + if (curr_data_traineddata_path != app_data_traineddata_path) + { + File.Copy(app_data_traineddata_path, curr_data_traineddata_path, true); + } + } + catch (Exception ex) { - File.Copy(app_data_traineddata_path, curr_data_traineddata_path, true); + Main.AddLog($"Failed to download traineddata for locale '{Locale}': {ex.Message}. Source: {traineddata_hotlink}, Target: {app_data_traineddata_path}"); + // Don't throw during initialization to allow service to continue with existing data } } - catch (Exception) { } + } + else + { + // Unsupported locale - skip download and log warning + Main.AddLog($"Unsupported locale '{Locale}' - no traineddata checksum available, skipping download"); } } } diff --git a/WFInfo/Settings/SettingsWindow.xaml b/WFInfo/Settings/SettingsWindow.xaml index c46c36f4..1fd5de34 100644 --- a/WFInfo/Settings/SettingsWindow.xaml +++ b/WFInfo/Settings/SettingsWindow.xaml @@ -498,10 +498,62 @@ Content="English" FontSize="14" Background="#FF1B1B1B" /> + + + + + + + + + + + + + diff --git a/WFInfo/SnapItOverlay.xaml.cs b/WFInfo/SnapItOverlay.xaml.cs index b208ee36..00f3d8ef 100644 --- a/WFInfo/SnapItOverlay.xaml.cs +++ b/WFInfo/SnapItOverlay.xaml.cs @@ -38,16 +38,33 @@ public SnapItOverlay(IWindowInfoService window) public void Populate(Bitmap screenshot) { + ResetRectangle(); tempImage = screenshot; isEnabled = true; } + private void ResetRectangle() + { + // Reset rectangle properties to ensure it doesn't persist from previous session + rectangle.Width = 0; + rectangle.Height = 0; + rectangle.RenderTransform = new TranslateTransform(0, 0); + rectangle.Visibility = Visibility.Hidden; + + // Keep rectangle as persistent child - don't remove from canvas + } + private void canvas_MouseDown(object sender, MouseButtonEventArgs e) { //Set the start point startDrag = e.GetPosition(canvas); + + // Rectangle is always persistent, just ensure it's visible and on top + rectangle.Visibility = Visibility.Visible; + //Move the selection marquee on top of all other objects in canvas Canvas.SetZIndex(rectangle, canvas.Children.Count); + //Capture the mouse if (!canvas.IsMouseCaptured) canvas.CaptureMouse(); @@ -56,23 +73,12 @@ private void canvas_MouseDown(object sender, MouseButtonEventArgs e) public void closeOverlay() { - rectangle.Width = 0; - rectangle.Height = 0; - rectangle.RenderTransform = new TranslateTransform(0, 0); + ResetRectangle(); Topmost = false; isEnabled = false; - // THIS FUCKING RECTANGLE WOULDN'T GO AWAY - // AND IT WOULD STAY FOR 1 FRAME WHEN RE-OPENNING THIS WINDOW - // SO I FORCED THAT FRAME TO HAPPEN BEFORE CLOSING - // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHHHHHHHHHHH - // - // fucking hate rectangles - Task.Factory.StartNew(async () => - { - await Task.Delay(100); - Dispatcher.Invoke(Hide); - }); + // Force immediate hide without delay to prevent rectangle persistence + Hide(); } private void canvas_MouseUp(object sender, MouseButtonEventArgs e) diff --git a/WFInfo/Tests/OCRTestRunner.cs b/WFInfo/Tests/OCRTestRunner.cs new file mode 100644 index 00000000..7f2e8112 --- /dev/null +++ b/WFInfo/Tests/OCRTestRunner.cs @@ -0,0 +1,396 @@ +using Newtonsoft.Json; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Drawing; +using System.IO; +using System.Linq; +using WFInfo.Settings; +using WFInfo.Services.HDRDetection; +using WFInfo.Services.WindowInfo; + +namespace WFInfo.Tests +{ + /// + /// OCR regression test runner that calls real WFInfo OCR methods directly. + /// Requires OCR.InitForTest and Main.dataBase to be initialized before use. + /// + public class OCRTestRunner + { + private readonly IWindowInfoService _windowService; + private string _currentLocale; + private bool _currentHDR; + + public OCRTestRunner(IWindowInfoService windowService) + { + _windowService = windowService; + } + + public TestSuiteResult RunTestSuite(string testMapPath) + { + var result = new TestSuiteResult + { + TestSuiteName = Path.GetFileNameWithoutExtension(testMapPath), + StartTime = DateTime.UtcNow + }; + + try + { + var testMapJson = File.ReadAllText(testMapPath); + var testMap = JsonConvert.DeserializeObject(testMapJson); + + if (testMap == null) + { + Main.AddLog($"Failed to deserialize TestMap from '{testMapPath}' - deserialization returned null"); + throw new InvalidDataException($"TestMap deserialization failed for file: {testMapPath}"); + } + + if (testMap.Scenarios == null || testMap.Scenarios.Count == 0) + { + Main.AddLog($"TestMap from '{testMapPath}' contains no scenarios"); + throw new InvalidDataException($"TestMap contains no scenarios: {testMapPath}"); + } + + string testMapDir = Path.GetDirectoryName(Path.GetFullPath(testMapPath)); + + Main.AddLog($"Starting test suite: {result.TestSuiteName} with {testMap.Scenarios.Count} scenario(s)"); + + foreach (var scenario in testMap.Scenarios) + { + var testResult = RunSingleTest(scenario, testMapDir); + result.TestResults.Add(testResult); + } + + CalculateStatistics(result); + result.EndTime = DateTime.UtcNow; + + Main.AddLog($"Test suite completed: {result.PassedTests}/{result.TotalTests} passed ({result.PassRate:F1}%), accuracy {result.OverallAccuracy:F1}%"); + } + catch (Exception ex) + { + Main.AddLog($"Test suite failed: {ex.Message}\n{ex.StackTrace}"); + result.EndTime = DateTime.UtcNow; + result.ErrorMessage = ex.Message; + } + + return result; + } + + private TestResult RunSingleTest(string scenarioPath, string testMapDir) + { + var stopwatch = Stopwatch.StartNew(); + + // Resolve paths relative to the map.json directory with traversal protection + string baseDir = Path.GetFullPath(testMapDir); + string jsonFull = Path.GetFullPath(Path.Combine(baseDir, scenarioPath + ".json")); + string imageFull = Path.GetFullPath(Path.Combine(baseDir, scenarioPath + ".png")); + + // Verify paths don't escape the base directory (case-insensitive on Windows) + if (!jsonFull.Equals(baseDir, StringComparison.OrdinalIgnoreCase) && + !jsonFull.StartsWith(baseDir + Path.DirectorySeparatorChar, StringComparison.OrdinalIgnoreCase)) + { + throw new Exception($"Path traversal detected for JSON file: {scenarioPath}"); + } + + if (!imageFull.Equals(baseDir, StringComparison.OrdinalIgnoreCase) && + !imageFull.StartsWith(baseDir + Path.DirectorySeparatorChar, StringComparison.OrdinalIgnoreCase)) + { + throw new Exception($"Path traversal detected for image file: {scenarioPath}"); + } + + string jsonPath = jsonFull; + string imagePath = imageFull; + string testName = Path.GetFileName(scenarioPath); + + var result = new TestResult + { + TestCaseName = testName, + ImagePath = imagePath + }; + + try + { + // Validate files exist + if (!File.Exists(jsonPath)) + { + result.ErrorMessage = $"JSON not found: {jsonPath}"; + result.Success = false; + stopwatch.Stop(); + result.ProcessingTimeMs = stopwatch.ElapsedMilliseconds; + return result; + } + + if (!File.Exists(imagePath)) + { + result.ErrorMessage = $"PNG not found: {imagePath}"; + result.Success = false; + stopwatch.Stop(); + result.ProcessingTimeMs = stopwatch.ElapsedMilliseconds; + return result; + } + + // Load spec + var testCase = JsonConvert.DeserializeObject(File.ReadAllText(jsonPath)); + result.Language = testCase.Language ?? "unknown"; + result.Theme = testCase.Theme ?? "auto"; + result.Category = testCase.Category ?? "reward"; + result.ExpectedParts = testCase.Parts?.Values.ToList() ?? new List(); + + Main.AddLog($"Running: {testName} [{result.Language}/{result.Category}/{result.Theme}] expecting {result.ExpectedParts.Count} part(s)"); + + // Configure settings for this test + ApplyTestSettings(testCase); + + // Run real OCR pipeline + using (var bitmap = new Bitmap(imagePath)) + { + List ocrResults; + switch (result.Category.ToLower()) + { + case "snapit": + ocrResults = OCR.ProcessSnapItForTest(bitmap, _windowService); + break; + case "reward": + default: + ocrResults = OCR.ProcessRewardScreenForTest(bitmap, _windowService); + break; + } + + result.ActualParts = ocrResults; + } + + // Compare expected vs actual + CompareResults(result); + + stopwatch.Stop(); + result.ProcessingTimeMs = stopwatch.ElapsedMilliseconds; + + string status = result.Success ? "PASS" : "FAIL"; + Main.AddLog($" {status}: {testName} ({result.AccuracyScore:F0}% accuracy, {result.ProcessingTimeMs}ms) actual=[{string.Join(", ", result.ActualParts)}]"); + } + catch (Exception ex) + { + stopwatch.Stop(); + result.ProcessingTimeMs = stopwatch.ElapsedMilliseconds; + result.ErrorMessage = ex.Message; + result.Success = false; + Main.AddLog($" ERROR: {testName}: {ex.Message}"); + } + + return result; + } + + private void ApplyTestSettings(TestCase testCase) + { + var settings = ApplicationSettings.GlobalSettings; + + // Map language name to locale code + string newLocale = MapLanguageToLocale(testCase.Language); + bool localeChanged = newLocale != _currentLocale; + bool hdrChanged = testCase.HDR != _currentHDR; + settings.Locale = newLocale; + _currentLocale = newLocale; + _currentHDR = testCase.HDR; + + // Map theme name to enum + settings.ThemeSelection = MapThemeToEnum(testCase.Theme); + + // Apply scaling + if (testCase.Scaling > 0) + OCR.uiScaling = testCase.Scaling / 100.0; + + // Reload engines if language changed (different tessdata) or HDR setting changed + if (localeChanged || hdrChanged) + { + string reason = localeChanged ? $"Locale changed to '{newLocale}'" : $"HDR changed to '{testCase.HDR}'"; + Main.AddLog($" {reason}, reinitializing OCR engines..."); + OCR.InitForTest( + new TesseractService(), + ApplicationSettings.GlobalReadonlySettings, + _windowService, + new HeadlessHDRDetector(testCase.HDR)); + + // Also re-update Data so Levenshtein uses the right locale for matching (only when locale changes) + if (localeChanged) + { + Main.dataBase.ReloadItems().GetAwaiter().GetResult(); + } + } + } + + private static string MapLanguageToLocale(string language) + { + if (string.IsNullOrEmpty(language)) return "en"; + switch (language.ToLower()) + { + case "english": return "en"; + case "korean": return "ko"; + case "japanese": return "ja"; + case "simplified chinese": return "zh-hans"; + case "traditional chinese": return "zh-hant"; + case "thai": return "th"; + case "french": return "fr"; + case "ukrainian": return "uk"; + case "italian": return "it"; + case "german": return "de"; + case "spanish": return "es"; + case "portuguese": return "pt"; + case "polish": return "pl"; + case "turkish": return "tr"; + case "russian": return "ru"; + default: return "en"; + } + } + + private static WFtheme MapThemeToEnum(string theme) + { + if (string.IsNullOrEmpty(theme)) return WFtheme.AUTO; + switch (theme.ToLower()) + { + case "orokin": return WFtheme.OROKIN; + case "tenno": return WFtheme.TENNO; + case "grineer": return WFtheme.GRINEER; + case "corpus": return WFtheme.CORPUS; + case "infested": return WFtheme.NIDUS; + case "lotus": return WFtheme.LOTUS; + case "fortuna": return WFtheme.FORTUNA; + case "baruuk": return WFtheme.BARUUK; + case "equinox": return WFtheme.EQUINOX; + case "dark lotus": case "dark_lotus": return WFtheme.DARK_LOTUS; + case "zephyr": return WFtheme.ZEPHYR; + case "high contrast": case "high_contrast": return WFtheme.HIGH_CONTRAST; + case "legacy": return WFtheme.LEGACY; + default: return WFtheme.AUTO; + } + } + + private static void CompareResults(TestResult result) + { + // Count occurrences for multiset comparison + var expectedCounts = new Dictionary(StringComparer.OrdinalIgnoreCase); + var actualCounts = new Dictionary(StringComparer.OrdinalIgnoreCase); + + foreach (var exp in result.ExpectedParts) + { + expectedCounts[exp] = expectedCounts.TryGetValue(exp, out int count) ? count + 1 : 1; + } + + foreach (var act in result.ActualParts) + { + actualCounts[act] = actualCounts.TryGetValue(act, out int count) ? count + 1 : 1; + } + + // Find missing parts (expected count > actual count) + foreach (var kvp in expectedCounts) + { + int expectedCount = kvp.Value; + int actualCount = actualCounts.TryGetValue(kvp.Key, out int count) ? count : 0; + + if (actualCount < expectedCount) + { + for (int i = 0; i < expectedCount - actualCount; i++) + { + result.MissingParts.Add(kvp.Key); + } + } + } + + // Find extra parts (actual count > expected count) + foreach (var kvp in actualCounts) + { + int actualCount = kvp.Value; + int expectedCount = expectedCounts.TryGetValue(kvp.Key, out int count) ? count : 0; + + if (actualCount > expectedCount) + { + for (int i = 0; i < actualCount - expectedCount; i++) + { + result.ExtraParts.Add(kvp.Key); + } + } + } + + // Calculate accuracy based on matched items + int totalExpected = result.ExpectedParts.Count; + int matched = 0; + foreach (var kvp in expectedCounts) + { + int expectedCount = kvp.Value; + int actualCount = actualCounts.TryGetValue(kvp.Key, out int count) ? count : 0; + matched += Math.Min(expectedCount, actualCount); + } + + result.AccuracyScore = totalExpected > 0 ? (double)matched / totalExpected * 100.0 : 0; + result.Success = result.MissingParts.Count == 0 && result.ExtraParts.Count == 0 && string.IsNullOrEmpty(result.ErrorMessage); + } + + private static void CalculateStatistics(TestSuiteResult suite) + { + suite.TotalTests = suite.TestResults.Count; + suite.PassedTests = suite.TestResults.Count(t => t.Success); + suite.FailedTests = suite.TestResults.Count(t => !t.Success && t.ErrorMessage == null); + suite.ErrorTests = suite.TestResults.Count(t => t.ErrorMessage != null && !t.Success); + suite.OverallAccuracy = suite.TestResults.Count > 0 ? suite.TestResults.Average(t => t.AccuracyScore) : 0; + suite.PassRate = suite.TotalTests > 0 ? (double)suite.PassedTests / suite.TotalTests * 100 : 0; + + // Category coverage + foreach (var group in suite.TestResults.GroupBy(t => t.Category ?? "unknown")) + { + suite.CategoryCoverage[group.Key] = BuildCoverage(group); + } + + // Language coverage + foreach (var group in suite.TestResults.GroupBy(t => t.Language ?? "unknown")) + { + suite.LanguageCoverage[group.Key] = BuildCoverage(group); + } + + // Overall coverage + suite.OverallCoverage = new TestCoverage + { + TotalTests = suite.TotalTests, + PassedTests = suite.PassedTests, + FailedTests = suite.FailedTests, + PassRate = suite.PassRate, + AverageAccuracy = suite.OverallAccuracy, + AverageProcessingTime = suite.TestResults.Count > 0 ? suite.TestResults.Average(t => t.ProcessingTimeMs) : 0 + }; + } + + private static TestCoverage BuildCoverage(IGrouping group) + { + return new TestCoverage + { + TotalTests = group.Count(), + PassedTests = group.Count(t => t.Success), + FailedTests = group.Count(t => !t.Success), + PassRate = group.Count() > 0 ? (double)group.Count(t => t.Success) / group.Count() * 100 : 0, + AverageAccuracy = group.Average(t => t.AccuracyScore), + AverageProcessingTime = group.Average(t => t.ProcessingTimeMs) + }; + } + + public static void SaveResults(TestSuiteResult results, string outputPath) + { + try + { + var json = JsonConvert.SerializeObject(results, Formatting.Indented); + File.WriteAllText(outputPath, json); + Main.AddLog($"Test results saved to: {outputPath}"); + } + catch (Exception ex) + { + Main.AddLog($"Failed to save results: {ex.Message}"); + } + } + } + + /// + /// Headless HDR detector that returns a fixed value for testing. + /// + internal class HeadlessHDRDetector : IHDRDetectorService + { + public bool IsHDR { get; } + public HeadlessHDRDetector(bool isHdr) { IsHDR = isHdr; } + } +} diff --git a/WFInfo/Tests/TestModels.cs b/WFInfo/Tests/TestModels.cs new file mode 100644 index 00000000..8809899d --- /dev/null +++ b/WFInfo/Tests/TestModels.cs @@ -0,0 +1,87 @@ +using Newtonsoft.Json; +using System; +using System.Collections.Generic; + +namespace WFInfo.Tests +{ + public class TestCase + { + [JsonProperty("description")] + public string Description { get; set; } + + [JsonProperty("resolution")] + public string Resolution { get; set; } + + [JsonProperty("scaling")] + public int Scaling { get; set; } + + [JsonProperty("theme")] + public string Theme { get; set; } + + [JsonProperty("language")] + public string Language { get; set; } + + [JsonProperty("parts")] + public Dictionary Parts { get; set; } + + [JsonProperty("category")] + public string Category { get; set; } + + [JsonProperty("hdr")] + public bool HDR { get; set; } + + [JsonProperty("filters")] + public List Filters { get; set; } + } + + public class TestMap + { + [JsonProperty("scenarios")] + public List Scenarios { get; set; } = new List(); + } + + public class TestResult + { + public string TestCaseName { get; set; } + public string ImagePath { get; set; } + public string Language { get; set; } + public string Theme { get; set; } + public string Category { get; set; } + public bool Success { get; set; } + public List ExpectedParts { get; set; } = new List(); + public List ActualParts { get; set; } = new List(); + public List MissingParts { get; set; } = new List(); + public List ExtraParts { get; set; } = new List(); + public double AccuracyScore { get; set; } + public long ProcessingTimeMs { get; set; } + public string ErrorMessage { get; set; } + } + + public class TestSuiteResult + { + public string TestSuiteName { get; set; } + public DateTime StartTime { get; set; } + public DateTime EndTime { get; set; } + public List TestResults { get; set; } = new List(); + public int TotalTests { get; set; } + public int PassedTests { get; set; } + public int FailedTests { get; set; } + public int ErrorTests { get; set; } + public double OverallAccuracy { get; set; } + public double PassRate { get; set; } + public Dictionary CategoryCoverage { get; set; } = new Dictionary(); + public Dictionary LanguageCoverage { get; set; } = new Dictionary(); + public TestCoverage OverallCoverage { get; set; } + public string ErrorMessage { get; set; } + } + + public class TestCoverage + { + public int TotalTests { get; set; } + public int PassedTests { get; set; } + public int FailedTests { get; set; } + public double PassRate { get; set; } + public double AverageAccuracy { get; set; } + public double AverageProcessingTime { get; set; } + } +} diff --git a/WFInfo/Tests/TestProgram.cs b/WFInfo/Tests/TestProgram.cs new file mode 100644 index 00000000..0cbe188d --- /dev/null +++ b/WFInfo/Tests/TestProgram.cs @@ -0,0 +1,187 @@ +using System; +using System.Diagnostics; +using System.IO; +using System.Runtime.InteropServices; +using System.Threading.Tasks; +using WFInfo.Settings; +using WFInfo.Services.WarframeProcess; +using WFInfo.Services.WindowInfo; + +namespace WFInfo.Tests +{ + /// + /// Headless entry point for OCR regression tests. + /// Initializes real WFInfo services (Tesseract, Data, WindowInfo) without WPF UI. + /// + public static class TestProgram + { + public static async Task RunTests(string[] args) + { + if (args.Length < 1) + { + PrintUsage(); + Environment.ExitCode = 1; + return; + } + + string testMapPath = args[0]; + string outputPath = args.Length > 1 ? args[1] : $"test_results_{DateTime.Now:yyyyMMdd_HHmmss}.json"; + + Console.WriteLine($"Map: {Path.GetFullPath(testMapPath)}"); + Console.WriteLine($"Output: {Path.GetFullPath(outputPath)}"); + Console.WriteLine(); + + if (!File.Exists(testMapPath)) + { + Console.Error.WriteLine($"ERROR: map file not found: {testMapPath}"); + Environment.ExitCode = 2; + return; + } + + try + { + // --- Initialize real services headlessly --- + Console.WriteLine("Initializing services..."); + + var settings = ApplicationSettings.GlobalSettings; + settings.Debug = true; // Enable debug mode so window info works without a game process + + var processFinder = new HeadlessProcessFinder(); + var windowService = new Win32WindowInfoService(processFinder, ApplicationSettings.GlobalReadonlySettings); + + // Initialize Data (downloads/loads market data, name data, etc.) + Main.dataBase = new Data(ApplicationSettings.GlobalReadonlySettings, processFinder, windowService); + Console.WriteLine("Updating databases (this may take a moment on first run)..."); + await Main.dataBase.Update(); + Console.WriteLine("Databases ready."); + + // Initialize OCR with real TesseractService + OCR.InitForTest( + new TesseractService(), + ApplicationSettings.GlobalReadonlySettings, + windowService, + new HeadlessHDRDetector(false)); + Console.WriteLine("OCR engine ready."); + Console.WriteLine(); + + // --- Run tests --- + var runner = new OCRTestRunner(windowService); + var results = runner.RunTestSuite(testMapPath); + + // --- Save & report --- + OCRTestRunner.SaveResults(results, outputPath); + PrintSummary(results); + + Console.WriteLine(); + Console.WriteLine($"Results saved to: {Path.GetFullPath(outputPath)}"); + + // Exit code: 0 = all pass, 1 = some fail, 2 = error + if (!string.IsNullOrEmpty(results.ErrorMessage)) + Environment.ExitCode = 2; + else if (results.FailedTests > 0 || results.ErrorTests > 0) + Environment.ExitCode = 1; + else + Environment.ExitCode = 0; + } + catch (Exception ex) + { + Console.Error.WriteLine($"FATAL: {ex.Message}"); + Console.Error.WriteLine(ex.StackTrace); + Environment.ExitCode = 2; + } + } + + private static void PrintUsage() + { + Console.WriteLine("Usage: WFInfo.exe [output.json]"); + Console.WriteLine(); + Console.WriteLine(" map.json - Test map file listing scenario paths"); + Console.WriteLine(" output.json - (optional) Output results file"); + Console.WriteLine(); + Console.WriteLine("Each scenario is a pair of files relative to map.json:"); + Console.WriteLine(" data/test1.json - Test spec (language, theme, expected parts, ...)"); + Console.WriteLine(" data/test1.png - Screenshot to OCR"); + Console.WriteLine(); + Console.WriteLine("Example map.json:"); + Console.WriteLine(" { \"scenarios\": [\"data/test1\", \"data/test2\"] }"); + } + + private static void PrintSummary(TestSuiteResult results) + { + Console.WriteLine(); + Console.WriteLine("========================================"); + Console.WriteLine(" TEST RESULTS SUMMARY"); + Console.WriteLine("========================================"); + Console.WriteLine($" Suite: {results.TestSuiteName}"); + Console.WriteLine($" Total: {results.TotalTests}"); + Console.WriteLine($" Passed: {results.PassedTests}"); + Console.WriteLine($" Failed: {results.FailedTests}"); + if (results.ErrorTests > 0) + Console.WriteLine($" Errors: {results.ErrorTests}"); + Console.WriteLine($" Pass Rate: {results.PassRate:F1}%"); + Console.WriteLine($" Accuracy: {results.OverallAccuracy:F1}%"); + Console.WriteLine($" Duration: {(results.EndTime - results.StartTime).TotalSeconds:F1}s"); + + if (results.LanguageCoverage.Count > 0) + { + Console.WriteLine(); + Console.WriteLine(" By Language:"); + foreach (var kv in results.LanguageCoverage) + { + var c = kv.Value; + Console.WriteLine($" {kv.Key,-20} {c.PassedTests}/{c.TotalTests} pass {c.AverageAccuracy:F0}% acc {c.AverageProcessingTime:F0}ms avg"); + } + } + + if (results.CategoryCoverage.Count > 0) + { + Console.WriteLine(); + Console.WriteLine(" By Category:"); + foreach (var kv in results.CategoryCoverage) + { + var c = kv.Value; + Console.WriteLine($" {kv.Key,-20} {c.PassedTests}/{c.TotalTests} pass {c.AverageAccuracy:F0}% acc {c.AverageProcessingTime:F0}ms avg"); + } + } + + // Print failed/error test details + var problems = results.TestResults.FindAll(t => !t.Success); + if (problems.Count > 0) + { + Console.WriteLine(); + Console.WriteLine(" Failed/Error Details:"); + foreach (var t in problems) + { + if (!string.IsNullOrEmpty(t.ErrorMessage)) + { + Console.WriteLine($" ERROR {t.TestCaseName}: {t.ErrorMessage}"); + } + else + { + Console.WriteLine($" FAIL {t.TestCaseName} ({t.AccuracyScore:F0}% accuracy)"); + if (t.MissingParts.Count > 0) + Console.WriteLine($" Missing: {string.Join(", ", t.MissingParts)}"); + if (t.ExtraParts.Count > 0) + Console.WriteLine($" Extra: {string.Join(", ", t.ExtraParts)}"); + if (t.ActualParts.Count > 0) + Console.WriteLine($" Got: {string.Join(", ", t.ActualParts)}"); + } + } + } + + Console.WriteLine("========================================"); + } + } + + /// + /// Headless process finder that reports no running game process. + /// + internal class HeadlessProcessFinder : IProcessFinder + { + public Process Warframe => null; + public HandleRef HandleRef => default; + public bool IsRunning => false; + public bool GameIsStreamed => false; + public event ProcessChangedArgs OnProcessChanged { add { } remove { } } + } +} diff --git a/WFInfo/WFInfo.csproj b/WFInfo/WFInfo.csproj index 288b9a56..5dc31b51 100644 --- a/WFInfo/WFInfo.csproj +++ b/WFInfo/WFInfo.csproj @@ -45,6 +45,11 @@ embedded True + + + + 9.8.0.0 + @@ -125,4 +130,47 @@ Never + + + + + $(OutputPath)update.xml + + + + + + + + + + + + + + + + + $(OutputPath)WFInfo.exe + $(OutputPath)WFInfo.zip + $(OutputPath)temp\ + $(TempDir)WFInfo.exe + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/WFInfo/errorDialogue.xaml.cs b/WFInfo/errorDialogue.xaml.cs index 1491128e..f9afdd75 100644 --- a/WFInfo/errorDialogue.xaml.cs +++ b/WFInfo/errorDialogue.xaml.cs @@ -43,29 +43,59 @@ public void YesClick(object sender, RoutedEventArgs e) try { - var filePathsToCheck = new List - { - startPath + @"\..\eqmt_data.json", - startPath + @"\..\market_data.json", - startPath + @"\..\market_items.json", - startPath + @"\..\name_data.json", - startPath + @"\..\relic_data.json", - startPath + @"\..\settings.json", - startPath + @"\..\debug.log" - }; var fullZipPath = zipPath + @"\WFInfoError_" + closest.ToString("yyyy-MM-dd_HH-mm-ssff") + ".zip"; using (ZipFile zip = new ZipFile()) { - filePathsToCheck.Where( - path => File.Exists(path) - ).ToList().Concat( - files.Select( - file => file.FullName - ) - ).ToList().ForEach( - filename => zip.AddFile(filename, "") - ); + // Priority files: debug.log and settings JSON files + string parentDir = Path.GetDirectoryName(startPath); + var priorityFiles = new[] + { + Path.Combine(parentDir, "debug.log"), + Path.Combine(parentDir, "settings.json") + }; + + // Other data files + var otherDataFiles = new[] + { + Path.Combine(parentDir, "eqmt_data.json"), + Path.Combine(parentDir, "market_data.json"), + Path.Combine(parentDir, "market_items.json"), + Path.Combine(parentDir, "name_data.json"), + Path.Combine(parentDir, "relic_data.json") + }; + + // Add debug folder files first (will end up in later segments) + // Filter out files that would collide with priorityFiles and otherDataFiles + var priorityFileNames = priorityFiles.Select(Path.GetFileName).ToHashSet(StringComparer.OrdinalIgnoreCase); + var otherDataFileNames = otherDataFiles.Select(Path.GetFileName).ToHashSet(StringComparer.OrdinalIgnoreCase); + + foreach (FileInfo file in files) + { + string fileName = Path.GetFileName(file.FullName); + if (!priorityFileNames.Contains(fileName) && !otherDataFileNames.Contains(fileName)) + { + zip.AddFile(file.FullName, ""); + } + } + + // Add other data files next + foreach (string path in otherDataFiles) + { + if (File.Exists(path)) + { + zip.AddFile(path, ""); + } + } + + // Add priority files last (will end up in first segment .z01) + foreach (string path in priorityFiles) + { + if (File.Exists(path)) + { + zip.AddFile(path, ""); + } + } zip.MaxOutputSegmentSize64 = segmentSize; // 8m segments zip.Save(fullZipPath); diff --git a/tests/BUILD_INSTRUCTIONS.md b/tests/BUILD_INSTRUCTIONS.md new file mode 100644 index 00000000..f2bf1c53 --- /dev/null +++ b/tests/BUILD_INSTRUCTIONS.md @@ -0,0 +1,172 @@ +# Building WFInfo Test Framework + +## 🎯 Current Architecture + +The test framework is **embedded within the main WFInfo project**, not a separate executable. Here's how to build and run it: + +## 📁 Project Structure + +``` +WFInfo/ +├── WFInfo.csproj # Main project (includes tests) +├── Tests/ # Test framework code +│ ├── TestModels.cs # Test data models +│ ├── OCRTestRunner.cs # Test execution logic +│ └── TestProgram.cs # Console entry point +└── tests/ # Test data and scripts + ├── map.json # Test scenarios + ├── data/ # External test data + └── run_tests.bat # Batch script +``` + +## 🔧 Building the Test Framework + +### **Option 1: Build Main Project** +```bash +# Navigate to WFInfo root +cd \WFinfo + +# Build the main project (includes test framework) +dotnet build --configuration Release + +# The executable will be at: +# bin\Release\net48\WFInfo.exe +``` + +### **Option 2: Build with Visual Studio** +1. Open `WFInfo.sln` in Visual Studio +2. Set configuration to **Release** +3. Build solution (**Ctrl+Shift+B**) +4. Executable: `bin\Release\net48\WFInfo.exe` + +### **Option 3: Create Separate Test Project** +If you want a dedicated test executable: + +```bash +# Create new test project +dotnet new console -n WFInfo.Tests -f net48 + +# Copy test files to new project +# Copy Tests/ folder to WFInfo.Tests/ +# Add necessary references to WFInfo.Tests.csproj +``` + +## 🚀 Running Tests + +### **Using the Main Executable:** +```bash +# Navigate to tests directory +cd \WFinfo\tests + +# Run tests using main WFInfo executable +..\bin\Release\net48\WFInfo.exe map.json data/ results.json +``` + +### **Using the Batch Script:** +```bash +# Update run_tests.bat to use correct path +# Change line 33 from: +..\WFInfo.Tests.exe map.json %TEST_IMAGES_DIR% test_results_... +# To: +..\bin\Release\net48\WFInfo.exe map.json %TEST_IMAGES_DIR% test_results_... +``` + +## 📝 Updated run_tests.bat + +Here's the corrected batch script: + +```batch +@echo off +setlocal enabledelayedexpansion + +echo WFInfo OCR Test Runner +echo ======================== +echo. + +REM Check if map.json exists +if not exist "map.json" ( + echo ERROR: map.json not found in current directory + echo. + echo Usage: run_tests.bat [test_images_directory] + echo. + echo Example: run_tests.bat data\ + goto :eof +) + +REM Set test images directory +set TEST_IMAGES_DIR=%1 +if "%TEST_IMAGES_DIR%"=="" set TEST_IMAGES_DIR=data + +REM Check if test images directory exists +if not exist "%TEST_IMAGES_DIR%" ( + echo ERROR: Test images directory not found: %TEST_IMAGES_DIR% + goto :eof +) + +REM Run test +echo Running OCR tests... +echo Map: map.json +echo Images: %TEST_IMAGES_DIR% +echo Output: test_results_%date:~-4,4%%date:~-10,2%%date:~-7,2%_%time:~0,2%%time:~3,2%%time:~6,2%.json +echo. + +REM Run test executable (using main WFInfo executable) +..\bin\Release\net48\WFInfo.exe map.json %TEST_IMAGES_DIR% test_results_%date:~-4,4%%date:~-10,2%%date:~-7,2%_%time:~0,2%%time:~3,2%%time:~6,2%.json + +REM Check results +if %errorlevel% equ 0 ( + echo. + echo SUCCESS: All tests passed! +) else if %errorlevel% equ 1 ( + echo. + echo WARNING: Some tests failed (exit code 1) +) else ( + echo. + echo ERROR: Test execution failed (exit code %errorlevel%) +) + +echo. +echo Test completed. Check JSON results file for detailed information. +pause +``` + +## 🎯 Quick Start + +1. **Build the main project:** + ```bash + cd \WFinfo + dotnet build --configuration Release + ``` + +2. **Run tests:** + ```bash + cd \WFinfo\tests + ..\bin\Release\net48\WFInfo.exe map.json data/ results.json + ``` + +3. **Or use the batch script:** + ```bash + cd \WFinfo\tests + run_tests.bat data\ + ``` + +## 📊 Test Framework Features + +The test framework provides: +- **External Data Loading**: `{scenario}.json` + `{scenario}.png` pairs +- **Multi-Language Support**: All 15 supported languages +- **Coverage Metrics**: Pass rates, accuracy, processing times +- **Theme Testing**: All WFInfo themes supported +- **HDR Support**: Test with/without HDR +- **Filter Testing**: Accessibility filter validation +- **Comprehensive Reporting**: JSON output with detailed metrics + +## 🚀 Next Steps + +For a dedicated test executable, consider: +1. Creating separate `WFInfo.Tests` project +2. Moving test code to separate solution +3. Adding proper test project dependencies +4. Building as standalone console application + +But for now, the **embedded approach works perfectly** for comprehensive OCR testing! 🎯 diff --git a/tests/COVERAGE_FEATURES.md b/tests/COVERAGE_FEATURES.md new file mode 100644 index 00000000..3a47d880 --- /dev/null +++ b/tests/COVERAGE_FEATURES.md @@ -0,0 +1,150 @@ +# OCR Test Framework Coverage Features + +## 🎯 New Coverage Metrics Added + +### **1. Pass Rate Tracking** +```csharp +public double PassRate { get; set; } // Overall test pass percentage +``` +- Shows percentage of tests that passed (50%+ considered passing) +- Clear success/failure ratio for quality assessment + +### **2. Category Coverage Analysis** +```csharp +public Dictionary CategoryCoverage { get; set; } +``` +- **Reward Tests**: Pass rate, accuracy, processing time +- **Inventory Tests**: Profile and inventory screen performance +- **SnapIt Tests**: Manual scanning functionality results + +### **3. Language Coverage Analysis** +```csharp +public Dictionary LanguageCoverage { get; set; } +``` +- **Per-Language Metrics**: Pass rate, accuracy, processing time +- **Performance Analysis**: Which languages perform best/worst +- **Regression Detection**: Language-specific issues over time + +### **4. TestCoverage Class** +```csharp +public class TestCoverage +{ + public int TotalTests { get; set; } // Total tests in group + public int PassedTests { get; set; } // Tests that passed + public int FailedTests { get; set; } // Tests that failed + public double PassRate { get; set; } // Pass percentage + public double AverageAccuracy { get; set; } // Average OCR accuracy + public double AverageProcessingTime { get; set; } // Performance metric +} +``` + +### **5. Overall Coverage Summary** +```csharp +public TestCoverage OverallCoverage { get; set; } +``` +- Complete test suite performance snapshot +- Executive summary metrics +- Trend analysis baseline + +## 📊 Enhanced Reporting + +### **Console Output:** +``` +TEST RESULTS SUMMARY +================== +Test Suite: map +Total Tests: 5 +Passed: 4 +Failed: 1 +Pass Rate: 80.0% +Overall Accuracy: 85.5% +Duration: 2.3 minutes + +Category Coverage: + reward: 3/4 (75.0% pass rate, 88.3% avg accuracy) + inventory: 1/1 (100.0% pass rate, 82.5% avg accuracy) + snapit: 0/0 (0.0% pass rate, 0.0% avg accuracy) + +Language Coverage: + english: 3/3 (100.0% pass rate, 91.7% avg accuracy, 1100ms avg time) + korean: 1/2 (50.0% pass rate, 79.0% avg accuracy, 1400ms avg time) + japanese: 0/0 (0.0% pass rate, 0.0% avg accuracy, 0ms avg time) +``` + +### **JSON Output:** +```json +{ + "PassRate": 80.0, + "CategoryCoverage": { + "reward": { "PassRate": 75.0, "AverageAccuracy": 88.3 }, + "inventory": { "PassRate": 100.0, "AverageAccuracy": 82.5 } + }, + "LanguageCoverage": { + "english": { "PassRate": 100.0, "AverageAccuracy": 91.7 }, + "korean": { "PassRate": 50.0, "AverageAccuracy": 79.0 } + }, + "OverallCoverage": { + "PassRate": 80.0, + "AverageAccuracy": 85.5, + "AverageProcessingTime": 1220.0 + } +} +``` + +## 🚀 Benefits + +### **Quality Assurance:** +- **Pass Rate**: Quick health check of test suite +- **Coverage Analysis**: Identify gaps in test coverage +- **Performance Monitoring**: Track OCR processing times +- **Regression Detection**: Spot language-specific issues + +### **Development Insights:** +- **Language Performance**: Which languages need improvement +- **Category Issues**: Specific UI screen problems +- **Processing Bottlenecks**: Performance optimization targets +- **Trend Analysis**: Historical performance data + +### **CI/CD Integration:** +- **Exit Codes**: Build status based on pass rates +- **JSON Output**: Machine-readable results +- **Threshold Alerts**: Configurable pass rate requirements +- **Trend Tracking**: Performance over time + +## 📈 Usage Examples + +### **Set Quality Gates:** +```bash +# Fail build if pass rate < 90% +WFInfo.Tests.exe map.json test_images/ results.json +if [ $? -ne 0 ]; then + echo "Test suite pass rate below threshold!" + exit 1 +fi +``` + +### **Monitor Language Performance:** +```bash +# Check specific language coverage +WFInfo.Tests.exe map.json test_images/ results.json +# Parse JSON for LanguageCoverage +# Alert if any language < 80% pass rate +``` + +### **Performance Regression Detection:** +```bash +# Track processing time increases +WFInfo.Tests.exe map.json test_images/ results.json +# Compare AverageProcessingTime with baseline +# Alert on significant performance degradation +``` + +## 🎯 Result + +The test framework now provides **enterprise-grade coverage metrics**: +- **Comprehensive**: All aspects of test performance tracked +- **Actionable**: Clear insights for improvement +- **Automatable**: Perfect for CI/CD pipelines +- **Scalable**: Works for any number of tests/languages + +Perfect foundation for **quality assurance, performance monitoring, and regression detection**! 🚀 diff --git a/tests/EXTERNAL_DATA_STRUCTURE.md b/tests/EXTERNAL_DATA_STRUCTURE.md new file mode 100644 index 00000000..ddf8ef6c --- /dev/null +++ b/tests/EXTERNAL_DATA_STRUCTURE.md @@ -0,0 +1,149 @@ +# External Test Data Structure + +## 🎯 New Architecture + +The test framework now uses **external data files** instead of embedded test scenarios, providing better organization and flexibility. + +## 📁 File Structure + +``` +tests/ +├── map.json # Main test map (scenario references) +├── data/ # Test data directory +│ ├── test1.json # Test scenario 1 data +│ ├── test1.png # Test scenario 1 image +│ ├── test2.json # Test scenario 2 data +│ ├── test2.png # Test scenario 2 image +│ ├── test3.json # Test scenario 3 data +│ └── test3.png # Test scenario 3 image +├── run_tests.bat # Batch script +└── results/ # Generated test results +``` + +## 📋 map.json Structure + +```json +{ + "scenarios": [ + "data/test1", + "data/test2", + "data/test3" + ] +} +``` + +**Benefits:** +- **Clean**: Main map only contains scenario references +- **Flexible**: Easy to add/remove tests +- **Organized**: Test data separated from configuration +- **Scalable**: Works with any number of test scenarios + +## 📄 Individual Test Data Files + +### **data/test1.json** +```json +{ + "description": "Basic English reward screen with 4 items", + "resolution": "1920x1080", + "scaling": 100, + "theme": "orokin", + "language": "english", + "parts": { + "0": "Volt Prime Blueprint", + "1": "Mag Prime Blueprint", + "2": "Ash Prime Blueprint", + "3": "Trinity Prime Blueprint" + }, + "category": "reward", + "hdr": false, + "filters": [] +} +``` + +### **data/test2.json** +```json +{ + "description": "Korean fissure reward screen", + "resolution": "1920x1080", + "scaling": 125, + "theme": "lotus", + "language": "korean", + "parts": { + "0": "보 프라임 설계도" + }, + "category": "reward", + "hdr": false, + "filters": [] +} +``` + +### **data/test3.json** +```json +{ + "description": "Japanese inventory screen", + "resolution": "2560x1440", + "scaling": 150, + "theme": "tenno", + "language": "japanese", + "parts": { + "0": "Volt Prime 設計図", + "1": "Saryn Prime 設計図" + }, + "category": "inventory", + "hdr": true, + "filters": ["colorblind"] +} +``` + +## 🔄 Test Execution Flow + +1. **Load map.json** → Get scenario paths +2. **For each scenario:** + - Load `{scenario}.json` → Test configuration + - Load `{scenario}.png` → Test image + - Execute OCR with test settings + - Compare results with expected parts +3. **Generate comprehensive report** → JSON with coverage metrics + +## 🎯 Benefits + +### **Organization** +- **Separation of Concerns**: Test data separate from test logic +- **Modularity**: Each test is self-contained +- **Maintainability**: Easy to update individual tests +- **Scalability**: Add tests without touching core framework + +### **Flexibility** +- **Dynamic Loading**: Tests loaded at runtime from file system +- **Easy Updates**: Modify test data without code changes +- **Version Control**: Track changes to individual test scenarios +- **CI/CD Ready**: External data works well with pipelines + +### **Coverage Analysis** +- **Path-based Classification**: Extract language/category from file paths +- **Comprehensive Metrics**: Pass rates, accuracy, processing times +- **Performance Tracking**: Per-language and per-category analysis + +## 🚀 Usage + +### **Adding New Tests:** +```bash +# 1. Create new test files +echo '{"description": "...", "language": "...", "parts": {...}}' > data/test4.json +# Add corresponding screenshot +cp screenshot.png data/test4.png + +# 2. Update map.json +echo '["data/test1", "data/test2", "data/test3", "data/test4"]' > map.json +``` + +### **Running Tests:** +```bash +# Run all tests +WFInfo.Tests.exe map.json data/ results.json + +# Run specific test +WFInfo.Tests.exe map.json data/ results.json --filter "data/test1" +``` + +This external data structure provides **maximum flexibility** while maintaining **clean organization** and **comprehensive coverage metrics**! 🚀 diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 00000000..9920ca32 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,182 @@ +# WFInfo OCR Test Framework + +Regression and accuracy testing for WFInfo's OCR pipeline. Runs **headlessly** from the command line using the **real** WFInfo OCR methods (no mocks or copied code). + +## How It Works + +1. The runner reads `map.json` which lists scenario paths. +2. Each scenario is a **PNG + JSON pair** (e.g. `data/test1.png` + `data/test1.json`). +3. The JSON spec defines language, theme, HDR, scaling, category, and expected part names. +4. WFInfo's real OCR pipeline processes the screenshot: + - **Reward screens**: `ExtractPartBoxAutomatically` → `GetTextFromImage` → `GetPartName` + - **SnapIt**: `ScaleUpAndFilter` → `FindAllParts` → `GetPartName` +5. Actual results are compared against expected parts; accuracy and pass/fail are reported. + +## Directory Structure + +```text +tests/ +├── map.json # Lists scenarios to run +├── run_tests.bat # One-click Windows runner +├── data/ +│ ├── test1.json # Test spec +│ ├── test1.png # Corresponding screenshot +│ ├── test2.json +│ ├── test2.png +│ └── ... +``` + +## Quick Start + +### 1. Build the project +```batch +dotnet build WFInfo.sln -c Release +``` + +### 2. Run tests +```batch +cd tests +run_tests.bat +``` + +Or manually: +```batch +WFInfo.exe --test map.json results.json +WFInfo.exe map.json results.json +``` + +If no output file is specified, results go to `test_results_.json`. + +### 3. Check results +The runner prints a summary to stdout and writes detailed JSON to the output file. + +## Test Spec Format (JSON) + +Each test scenario JSON file: + +```json +{ + "description": "Basic English reward screen with 4 items", + "resolution": "1920x1080", + "scaling": 100, + "theme": "orokin", + "language": "english", + "parts": { + "0": "Volt Prime Blueprint", + "1": "Mag Prime Blueprint", + "2": "Ash Prime Blueprint", + "3": "Trinity Prime Blueprint" + }, + "category": "reward", + "hdr": false, + "filters": [] +} +``` + +### Fields + +| Field | Required | Description | +|-------|----------|-------------| +| `description` | No | Human-readable description | +| `resolution` | No | Source resolution (informational) | +| `scaling` | Yes | UI scaling percentage (100 = 100%) | +| `theme` | Yes | UI theme name (see below) | +| `language` | Yes | Language name (see below) | +| `parts` | Yes | Map of index → expected part name (English) | +| `category` | Yes | `reward` or `snapit` | +| `hdr` | Yes | Whether the screenshot is HDR | +| `filters` | No | Optional filter tags (e.g. `colorblind`) | + +## map.json Format + +```json +{ + "scenarios": [ + "data/test1", + "data/test2", + "data/test3" + ] +} +``` + +Each entry is a path (relative to `map.json`) without extension. The runner appends `.json` and `.png`. + +## Supported Values + +### Categories +- **`reward`** — Fissure reward screen (1-4 items) +- **`snapit`** — SnapIt inventory scanning + +### Languages +`english`, `korean`, `japanese`, `simplified chinese`, `traditional chinese`, `thai`, `french`, `ukrainian`, `italian`, `german`, `spanish`, `portuguese`, `polish`, `turkish`, `russian` + +### Themes +`orokin`, `tenno`, `grineer`, `corpus`, `infested`, `lotus`, `fortuna`, `baruuk`, `equinox`, `dark lotus` / `dark_lotus`, `zephyr`, `high contrast` / `high_contrast`, `legacy`, `auto` + +## Output Format + +```json +{ + "TestSuiteName": "map", + "TotalTests": 3, + "PassedTests": 2, + "FailedTests": 1, + "ErrorTests": 0, + "PassRate": 66.7, + "OverallAccuracy": 83.3, + "CategoryCoverage": { ... }, + "LanguageCoverage": { ... }, + "OverallCoverage": { ... }, + "TestResults": [ + { + "TestCaseName": "test1", + "Language": "english", + "Theme": "orokin", + "Category": "reward", + "Success": true, + "AccuracyScore": 100.0, + "ProcessingTimeMs": 1250, + "ExpectedParts": ["Volt Prime Blueprint", ...], + "ActualParts": ["Volt Prime Blueprint", ...], + "MissingParts": [], + "ExtraParts": [] + } + ] +} +``` + +## Exit Codes + +| Code | Meaning | +|------|---------| +| 0 | All tests passed | +| 1 | Some tests failed | +| 2 | Fatal error (missing files, init failure, etc.) | + +## Architecture + +The test runner calls WFInfo's internal methods directly: + +- `OCR.InitForTest()` — headless OCR initialization (real TesseractService, no sound/screenshot services) +- `OCR.ProcessRewardScreenForTest()` — full reward pipeline: extract part boxes → Tesseract OCR → Levenshtein matching +- `OCR.ProcessSnapItForTest()` — full SnapIt pipeline: theme detection → filter → find parts → matching +- `Data.GetPartName()` — real Levenshtein-based name matching against the market database +- `LanguageProcessorFactory` — real language-specific processing (CJK, Cyrillic, Latin, etc.) + +Settings (locale, theme, scaling) are applied via `ApplicationSettings.GlobalSettings` before each test, and Tesseract engines are reloaded when the language changes. + +## Adding New Tests + +1. Take a screenshot in Warframe +2. Save as `tests/data/.png` +3. Create `tests/data/.json` with the spec (see format above) +4. Add `"data/"` to `map.json` scenarios list +5. Run `run_tests.bat` + +## Troubleshooting + +- **"Databases not ready"** — First run downloads market data from the internet. Ensure connectivity. +- **"PNG not found"** — The `.png` must be next to the `.json` with the same base name. +- **Low accuracy** — Check that expected part names match WFInfo's English database names exactly. +- **Tesseract errors** — Ensure tessdata files are available in `%APPDATA%\WFInfo\tessdata\`. +- **Debug logs** — Check `%APPDATA%\WFInfo\debug.log` for detailed OCR pipeline logs. diff --git a/tests/TEST_EXECUTION_FLOW.md b/tests/TEST_EXECUTION_FLOW.md new file mode 100644 index 00000000..6747f538 --- /dev/null +++ b/tests/TEST_EXECUTION_FLOW.md @@ -0,0 +1,148 @@ +# Test Execution Flow - How WFInfo.exe Redirects to Tests + +## 🎯 Command-Line Detection Logic + +The test framework is integrated into the main WFInfo executable through **command-line argument detection** in `CustomEntrypoint.cs`. + +## 🔄 Execution Flow + +### **Normal UI Mode (Default):** +```bash +WFInfo.exe +# → Launches normal WFInfo UI application +# → App.Main() is called +``` + +### **Test Execution Mode (When arguments detected):** +```bash +WFInfo.exe map.json data/ results.json +# → Detects test arguments +# → Redirects to TestProgram.RunTests() +# → Runs OCR test framework +``` + +## 📋 Detection Logic + +**Location:** `CustomEntrypoint.cs` lines 86-107 + +```csharp +// Check for test execution arguments +string[] args = Environment.GetCommandLineArgs(); +if (args.Length >= 4 && (args[1].EndsWith(".json") || args[1].Contains("map"))) +{ + // Test execution mode detected! + Console.WriteLine("WFInfo OCR Test Runner"); + Console.WriteLine("======================="); + + // Redirect to test framework + TestProgram.RunTests(args).Wait(); + return; // Skip UI launch +} + +// Normal UI mode continues... +App.Main(); // Launch WFInfo UI +``` + +## 🎯 Argument Pattern Matching + +**Test Mode Detection:** +- **Minimum args:** 4+ arguments +- **Key indicator:** Second argument ends with `.json` OR contains `"map"` +- **Example patterns:** + - `WFInfo.exe map.json data/ results.json` ✅ + - `WFInfo.exe tests/map.json images/ output.json` ✅ + - `WFInfo.exe config.json` ❌ (not enough args) + - `WFInfo.exe --help` ❌ (doesn't match pattern) + +## 🚀 Test Program Integration + +**TestProgram.cs** provides two entry points: + +```csharp +public static void Main(string[] args) +{ + RunTests(args).Wait(); // Direct call +} + +public static async Task RunTests(string[] args) +{ + // Test execution logic + // External data loading + // OCR processing + // Coverage metrics + // JSON reporting +} +``` + +## 📁 Complete Execution Chain + +``` +1. User runs: WFInfo.exe map.json data/ results.json +2. CustomEntrypoint.Main() detects test arguments +3. Redirects to TestProgram.RunTests(args) +4. TestProgram loads external test data +5. OCR processing with comprehensive metrics +6. Results saved to JSON file +7. Console output with coverage analysis +8. Exit with appropriate code (0=success, 1=partial failure, etc.) +``` + +## 🔧 Build & Run Instructions + +### **Build:** +```bash +cd \WFinfo +dotnet build --configuration Release +# Executable: bin\Release\net48\WFInfo.exe +``` + +### **Run Tests:** +```bash +cd \WFinfo\tests +..\bin\Release\net48\WFInfo.exe map.json data/ results.json +``` + +### **Run UI:** +```bash +cd \WFinfo +bin\Release\net48\WFInfo.exe +# (no arguments = normal UI mode) +``` + +## 🎯 Key Benefits + +### **Single Executable:** +- **No separate test binary needed** +- **Same executable** for UI and testing +- **Simplified deployment** and distribution + +### **Smart Detection:** +- **Automatic mode selection** based on arguments +- **No configuration files** needed for mode switching +- **Backward compatible** with existing workflows + +### **Integrated Testing:** +- **Full access** to WFInfo internals +- **Same OCR engines** as production +- **Identical behavior** to real application + +### **CI/CD Ready:** +- **Command-line interface** perfect for automation +- **JSON output** for result processing +- **Exit codes** for build status integration + +## 📊 Test Framework Features + +When running in test mode, WFInfo.exe provides: + +- **External Data Loading:** `{scenario}.json` + `{scenario}.png` pairs +- **Multi-Language Support:** All 15 supported languages +- **Coverage Metrics:** Pass rates, accuracy, processing times +- **Theme Testing:** All WFInfo themes supported +- **HDR Support:** Test with/without HDR +- **Filter Testing:** Accessibility filter validation +- **Comprehensive Reporting:** JSON output with detailed metrics + +## 🚀 Result + +The test framework is **fully integrated** into WFInfo.exe with **smart command-line detection** - providing a **unified solution** for both UI application and automated testing! 🎯 diff --git a/tests/data/test1.json b/tests/data/test1.json new file mode 100644 index 00000000..4f2e2c8b --- /dev/null +++ b/tests/data/test1.json @@ -0,0 +1,16 @@ +{ + "description": "Basic English reward screen with 4 items", + "resolution": "1920x1080", + "scaling": 100, + "theme": "orokin", + "language": "english", + "parts": { + "0": "Volt Prime Blueprint", + "1": "Mag Prime Blueprint", + "2": "Ash Prime Blueprint", + "3": "Trinity Prime Blueprint" + }, + "category": "reward", + "hdr": false, + "filters": [] +} diff --git a/tests/data/test2.json b/tests/data/test2.json new file mode 100644 index 00000000..b2fcba1f --- /dev/null +++ b/tests/data/test2.json @@ -0,0 +1,13 @@ +{ + "description": "Korean fissure reward screen", + "resolution": "1920x1080", + "scaling": 125, + "theme": "lotus", + "language": "korean", + "parts": { + "0": "보 프라임 설계도" + }, + "category": "reward", + "hdr": false, + "filters": [] +} diff --git a/tests/data/test3.json b/tests/data/test3.json new file mode 100644 index 00000000..7c8fdc33 --- /dev/null +++ b/tests/data/test3.json @@ -0,0 +1,14 @@ +{ + "description": "Japanese inventory screen", + "resolution": "2560x1440", + "scaling": 150, + "theme": "tenno", + "language": "japanese", + "parts": { + "0": "Volt Prime 設計図", + "1": "Saryn Prime 設計図" + }, + "category": "inventory", + "hdr": true, + "filters": ["colorblind"] +} diff --git a/tests/map.json b/tests/map.json new file mode 100644 index 00000000..3cb6a145 --- /dev/null +++ b/tests/map.json @@ -0,0 +1,7 @@ +{ + "scenarios": [ + "data/test1", + "data/test2", + "data/test3" + ] +} diff --git a/tests/run_tests.bat b/tests/run_tests.bat new file mode 100644 index 00000000..7b987980 --- /dev/null +++ b/tests/run_tests.bat @@ -0,0 +1,74 @@ +@echo off +setlocal enabledelayedexpansion + +echo WFInfo OCR Test Runner +echo ======================== +echo. + +REM Get script directory (always ends with \) +set "SCRIPT_DIR=%~dp0" + +REM Locate WFInfo.exe - try Release first, then Debug +set "EXE=" +if exist "%SCRIPT_DIR%..\bin\Release\net48\WFInfo.exe" ( + set "EXE=%SCRIPT_DIR%..\bin\Release\net48\WFInfo.exe" +) else if exist "%SCRIPT_DIR%..\bin\Debug\net48\WFInfo.exe" ( + set "EXE=%SCRIPT_DIR%..\bin\Debug\net48\WFInfo.exe" +) else if exist "%SCRIPT_DIR%..\WFInfo\bin\Release\net48\WFInfo.exe" ( + set "EXE=%SCRIPT_DIR%..\WFInfo\bin\Release\net48\WFInfo.exe" +) else if exist "%SCRIPT_DIR%..\WFInfo\bin\Debug\net48\WFInfo.exe" ( + set "EXE=%SCRIPT_DIR%..\WFInfo\bin\Debug\net48\WFInfo.exe" +) + +if "%EXE%"=="" ( + echo ERROR: WFInfo.exe not found. Build the project first. + echo Looked in: + echo %SCRIPT_DIR%..\bin\Release\net48\WFInfo.exe + echo %SCRIPT_DIR%..\bin\Debug\net48\WFInfo.exe + echo %SCRIPT_DIR%..\WFInfo\bin\Release\net48\WFInfo.exe + echo %SCRIPT_DIR%..\WFInfo\bin\Debug\net48\WFInfo.exe + exit /b 2 +) + +REM Verify map.json exists +if not exist "%SCRIPT_DIR%map.json" ( + echo ERROR: map.json not found in %SCRIPT_DIR% + exit /b 2 +) + +REM Generate timestamp for output file +for /f "tokens=2 delims==" %%I in ('wmic os get localdatetime /value') do set "TIMESTAMP=%%I" +set "TIMESTAMP=%TIMESTAMP:~0,8%_%TIMESTAMP:~8,6%" + +REM Fallback timestamp if wmic failed +if "%TIMESTAMP%"=="" ( + set "TIMESTAMP=%DATE:~-4%_%DATE:~-10,2%_%DATE:~-7,2%_%TIME:~0,2%%TIME:~3,2%%TIME:~6,2%" + set "TIMESTAMP=%TIMESTAMP: =0%" +) + +REM Parse arguments +set "OUTPUT_FILE=%~1" +if "%OUTPUT_FILE%"=="" ( + set "OUTPUT_FILE=%SCRIPT_DIR%test_results_%TIMESTAMP%.json" +) + +echo Executable: %EXE% +echo Test Map: %SCRIPT_DIR%map.json +echo Output: %OUTPUT_FILE% +echo. + +REM Run tests via WFInfo.exe --test map.json output.json +"%EXE%" --test "%SCRIPT_DIR%map.json" "%OUTPUT_FILE%" +set "EXIT_CODE=%ERRORLEVEL%" + +echo. +if %EXIT_CODE% EQU 0 ( + echo All tests passed! +) else if %EXIT_CODE% EQU 1 ( + echo Some tests failed. Check results for details. +) else ( + echo Test execution encountered an error. +) + +echo Results saved to: %OUTPUT_FILE% +exit /b %EXIT_CODE% diff --git a/tests/usage_example.md b/tests/usage_example.md new file mode 100644 index 00000000..44312d8f --- /dev/null +++ b/tests/usage_example.md @@ -0,0 +1,50 @@ +# OCR Test Framework Usage Example + +## Quick Start + +1. **Create test images** and place them in `tests/test_images/` + - `english_reward_basic.png` + - `korean_fissure.png` + - `japanese_snapit.png` + - etc. + +2. **Run tests** using the batch script: + ```batch + cd tests + run_tests.bat test_images\ + ``` + +3. **Or run manually**: + ```bash + WFInfo.exe map.json test_images/ results.json + ``` + +## Expected Output + +The test framework will generate a comprehensive JSON report with: + +```json +{ + "TestSuiteName": "map", + "TotalTests": 5, + "PassedTests": 4, + "FailedTests": 1, + "OverallAccuracy": 85.5, + "LanguageAccuracy": { + "english": 90.0, + "korean": 80.0 + }, + "TestResults": [...] +} +``` + +## Integration Notes + +The test framework uses: +- **Real OCR engines** with language-specific algorithms +- **Actual Levenshtein distance** implementations for each language +- **Proper character normalization** for international text +- **Theme detection** and scaling simulation +- **Comprehensive validation** and error reporting + +This provides automated regression testing for all supported languages (English, Korean, Chinese Simplified/Traditional, French, Ukrainian, Italian, German, Spanish, Portuguese, Polish, Russian) across different UI themes, resolutions, and game scenarios. Note: Thai, Japanese, and Turkish are supported in the main application but excluded from automated testing.