diff --git a/.gitattributes b/.gitattributes
index 1ff0c423..4c6690d0 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -2,62 +2,4 @@
# Set default behavior to automatically normalize line endings.
###############################################################################
* text=auto
-
-###############################################################################
-# Set default behavior for command prompt diff.
-#
-# This is need for earlier builds of msysgit that does not have it on by
-# default for csharp files.
-# Note: This is only used by command line
-###############################################################################
-#*.cs diff=csharp
-
-###############################################################################
-# Set the merge driver for project and solution files
-#
-# Merging from the command prompt will add diff markers to the files if there
-# are conflicts (Merging from VS is not affected by the settings below, in VS
-# the diff markers are never inserted). Diff markers may cause the following
-# file extensions to fail to load in VS. An alternative would be to treat
-# these files as binary and thus will always conflict and require user
-# intervention with every merge. To do so, just uncomment the entries below
-###############################################################################
-#*.sln merge=binary
-#*.csproj merge=binary
-#*.vbproj merge=binary
-#*.vcxproj merge=binary
-#*.vcproj merge=binary
-#*.dbproj merge=binary
-#*.fsproj merge=binary
-#*.lsproj merge=binary
-#*.wixproj merge=binary
-#*.modelproj merge=binary
-#*.sqlproj merge=binary
-#*.wwaproj merge=binary
-
-###############################################################################
-# behavior for image files
-#
-# image files are treated as binary by default.
-###############################################################################
-#*.jpg binary
-#*.png binary
-#*.gif binary
-
-###############################################################################
-# diff behavior for common document formats
-#
-# Convert binary document formats to text before diffing them. This feature
-# is only available from the command line. Turn it on by uncommenting the
-# entries below.
-###############################################################################
-#*.doc diff=astextplain
-#*.DOC diff=astextplain
-#*.docx diff=astextplain
-#*.DOCX diff=astextplain
-#*.dot diff=astextplain
-#*.DOT diff=astextplain
-#*.pdf diff=astextplain
-#*.PDF diff=astextplain
-#*.rtf diff=astextplain
-#*.RTF diff=astextplain
+*.bat text eol=crlf
\ No newline at end of file
diff --git a/TestCharacterRanges.cs b/TestCharacterRanges.cs
new file mode 100644
index 00000000..c302d75b
--- /dev/null
+++ b/TestCharacterRanges.cs
@@ -0,0 +1,58 @@
+using System;
+using WFInfo.LanguageProcessing;
+using WFInfo.Settings;
+
+namespace WFInfo.Test
+{
+    /// <summary>
+    /// Simple smoke test that verifies character range generation works for the CJK processors.
+    /// </summary>
+    public class TestCharacterRanges
+    {
+        /// <summary>
+        /// Builds each CJK language processor and reports the length of its character whitelist.
+        /// Named <c>Run</c> because a member may not share its enclosing type's name (CS0542).
+        /// </summary>
+        /// <exception cref="InvalidOperationException">A processor produced a null or empty whitelist.</exception>
+        public static void Run()
+        {
+            Console.WriteLine("Testing character range generation...");
+
+            // Create a mock settings object
+            var settings = new TestApplicationSettings();
+
+            try
+            {
+                Report("Japanese", new JapaneseLanguageProcessor(settings).CharacterWhitelist);
+                Report("Korean", new KoreanLanguageProcessor(settings).CharacterWhitelist);
+                Report("Simplified Chinese", new SimplifiedChineseLanguageProcessor(settings).CharacterWhitelist);
+                Report("Traditional Chinese", new TraditionalChineseLanguageProcessor(settings).CharacterWhitelist);
+
+                Console.WriteLine("All character range tests passed!");
+            }
+            catch (Exception ex)
+            {
+                Console.WriteLine($"Error testing character ranges: {ex.Message}");
+                throw;
+            }
+        }
+
+        /// <summary>Prints the whitelist length and fails when the whitelist is missing or empty.</summary>
+        private static void Report(string language, string whitelist)
+        {
+            if (string.IsNullOrEmpty(whitelist))
+                throw new InvalidOperationException($"{language} whitelist is empty");
+
+            Console.WriteLine($"{language} whitelist length: {whitelist.Length}");
+        }
+    }
+
+ /// <summary>
+ /// Mock application settings for testing; only Locale is implemented and it always reports "en".
+ /// </summary>
+ public class TestApplicationSettings : IReadOnlyApplicationSettings
+ {
+ public string Locale => "en";
+ // NOTE(review): IReadOnlyApplicationSettings is declared elsewhere; any other members it requires must be added here before this compiles - confirm
+ }
+}
diff --git a/WFInfo/CustomEntrypoint.cs b/WFInfo/CustomEntrypoint.cs
index d70cc31f..20dcd80c 100644
--- a/WFInfo/CustomEntrypoint.cs
+++ b/WFInfo/CustomEntrypoint.cs
@@ -14,6 +14,7 @@
using System.Linq;
using System.CodeDom;
using Tesseract;
+using WFInfo.Tests;
namespace WFInfo
{
@@ -83,6 +84,41 @@ public static void Main()
Directory.CreateDirectory(appPath);
+ // Check for test execution arguments
+ // Usage: WFInfo.exe [--test | -test | --map] map.json [output.json]
+ string[] args = Environment.GetCommandLineArgs().Skip(1).ToArray();
+ bool isTestMode = false;
+
+ if (args.Length >= 1 && (args[0].Equals("--test", StringComparison.OrdinalIgnoreCase) ||
+ args[0].Equals("-test", StringComparison.OrdinalIgnoreCase) ||
+ args[0].Equals("--map", StringComparison.OrdinalIgnoreCase)))
+ {
+ isTestMode = true;
+ args = args.Skip(1).ToArray(); // strip flag
+ }
+ else if (args.Length >= 1 && args[0].EndsWith(".json", StringComparison.OrdinalIgnoreCase))
+ {
+ isTestMode = true;
+ }
+
+ if (isTestMode)
+ {
+ try
+ {
+ Console.WriteLine("WFInfo OCR Test Runner");
+ Console.WriteLine("=======================");
+ TestProgram.RunTests(args).GetAwaiter().GetResult();
+ return;
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($"Test execution failed: {ex.Message}");
+ Console.WriteLine(ex.StackTrace);
+ Environment.Exit(1);
+ return;
+ }
+ }
+
string thisprocessname = Process.GetCurrentProcess().ProcessName;
string version = Assembly.GetExecutingAssembly().GetName().Version.ToString();
if (Process.GetProcesses().Count(p => p.ProcessName == thisprocessname) > 1)
diff --git a/WFInfo/Data.cs b/WFInfo/Data.cs
index 1f0b7b4d..2e653365 100644
--- a/WFInfo/Data.cs
+++ b/WFInfo/Data.cs
@@ -18,6 +18,7 @@
using WFInfo.Services.WarframeProcess;
using WFInfo.Services.WindowInfo;
using WFInfo.Settings;
+using WFInfo.LanguageProcessing;
namespace WFInfo
{
@@ -30,28 +31,6 @@ class Data
public JObject equipmentData; // Contains equipmentData from Warframe PC Drops {: {"vaulted": true, "PARTS": {:{"relic_name":|"","count":}, ...}}, ...}
public JObject nameData; // Contains relic to market name translation {: }
- private static readonly List>> korean = new List>>() {
- new Dictionary>() {
- { 0, new List{ 6, 7, 8, 16 } }, // ㅁ, ㅂ, ㅃ, ㅍ
- { 1, new List{ 2, 3, 4, 16, 5, 9, 10 } }, // ㄴ, ㄷ, ㄸ, ㅌ, ㄹ, ㅅ, ㅆ
- { 2, new List{ 12, 13, 14 } }, // ㅈ, ㅉ, ㅊ
- { 3, new List{ 0, 1, 15, 11, 18 } } // ㄱ, ㄲ, ㅋ, ㅇ, ㅎ
- },
- new Dictionary>() {
- { 0, new List{ 20, 5, 1, 7, 3, 19 } }, // ㅣ, ㅔ, ㅐ, ㅖ, ㅒ, ㅢ
- { 1, new List{ 16, 11, 15, 10 } }, // ㅟ, ㅚ, ㅞ, ㅙ
- { 2, new List{ 4, 0, 6, 2, 14, 9 } }, // ㅓ, ㅏ, ㅕ, ㅑ, ㅝ, ㅘ
- { 3, new List{ 18, 13, 8, 17, 12 } } // ㅡ, ㅜ, ㅗ, ㅠ, ㅛ
- },
- new Dictionary>() {
- { 0, new List{ 16, 17, 18, 26 } }, // ㅁ, ㅂ, ㅄ, ㅍ
- { 1, new List{ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 20, 25 } }, // ㄴ, ㄵ, ㄶ, ㄷ, ㄹ, ㄺ, ㄻ, ㄼ, ㄽ, ㄾ, ㄿ, ㅀ, ㅅ, ㅆ, ㅌ
- { 2, new List{ 22, 23 } }, // ㅈ, ㅊ
- { 3, new List{ 1, 2, 3, 24, 21, 27 } }, // ㄱ, ㄲ, ㄳ, ㅋ, ㅑ, ㅎ
- { 4, new List{ 0 } }, //
- }
- };
-
private readonly string applicationDirectory = Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData) + @"\WFInfo";
private readonly string marketItemsPath;
private readonly string marketDataPath;
@@ -60,7 +39,6 @@ class Data
private readonly string nameDataPath;
private readonly string filterAllJsonFallbackPath;
private readonly string sheetJsonFallbackPath;
- private readonly Dictionary wfmItemsFallbackPaths;
public string JWT; // JWT is the security key, store this as email+pw combo'
private ClientWebSocket marketSocket = new ClientWebSocket();
private CancellationTokenSource marketSocketCancellation = new CancellationTokenSource();
@@ -109,6 +87,9 @@ public Data(IReadOnlyApplicationSettings settings, IProcessFinder process, IWind
_process = process;
_window = window;
+ // Initialize the language processor factory
+ LanguageProcessorFactory.Initialize(settings);
+
Main.AddLog("Initializing Databases");
marketItemsPath = applicationDirectory + @"\market_items.json";
marketDataPath = applicationDirectory + @"\market_data.json";
@@ -117,12 +98,7 @@ public Data(IReadOnlyApplicationSettings settings, IProcessFinder process, IWind
nameDataPath = applicationDirectory + @"\name_data.json";
filterAllJsonFallbackPath = applicationDirectory + @"\fallback_equipment_list.json";
sheetJsonFallbackPath = applicationDirectory + @"\fallback_price_sheet.json";
- wfmItemsFallbackPaths = new Dictionary();
- string[] locales = new string[] { "en", "ko" };
- foreach (string locale in locales)
- {
- wfmItemsFallbackPaths[locale] = applicationDirectory + @"\fallback_names_" + locale + ".json";
- }
+ // wfmItemsFallbackPath will be computed per-request in GetWfmItemList
Directory.CreateDirectory(applicationDirectory);
@@ -229,17 +205,42 @@ public async Task ReloadItems()
items = JArray.FromObject(localizedItems.Data["data"]);
foreach (var item in items)
{
- string name = item["slug"].ToString();
- if (name.Contains("prime") && tempMarketItems.ContainsKey(item["id"].ToString()))
- tempMarketItems[item["id"].ToString()] = tempMarketItems[item["id"].ToString()] + "|" + item["i18n"][_settings.Locale]["name"];
+ string itemId = item["id"].ToString();
+ if (tempMarketItems.ContainsKey(itemId))
+ {
+ // Validate presence of locale data and throw exception if missing
+ if (item["i18n"] == null)
+ {
+ throw new KeyNotFoundException($"Item {itemId} missing i18n data entirely");
+ }
+
+ if (item["i18n"][_settings.Locale] == null)
+ {
+ throw new KeyNotFoundException($"Item {itemId} missing locale data for {_settings.Locale}");
+ }
+
+ if (item["i18n"][_settings.Locale]["name"] == null)
+ {
+ throw new KeyNotFoundException($"Item {itemId} missing name field for locale {_settings.Locale}");
+ }
+
+ string localizedName = item["i18n"][_settings.Locale]["name"].ToString();
+ tempMarketItems[itemId] = tempMarketItems[itemId] + "|" + localizedName;
+ }
}
+ // Add locale metadata for cache validation
+ tempMarketItems["locale"] = _settings.Locale;
+
// Atomically replace marketItems under lock
lock (marketItemsLock)
{
marketItems = tempMarketItems;
}
+ // Save only the updated marketItems to file
+ SaveDatabase(marketItemsPath, marketItems);
+
Main.AddLog("Item database has been downloaded");
return enItems.IsFallback || localizedItems.IsFallback;
}
@@ -425,6 +426,9 @@ private async Task LoadMarketItem(string url)
private async Task<(JObject Data, bool IsFallback)> GetWfmItemList(string locale)
{
+ // Compute locale-specific fallback path per-request
+ string localeSpecificFallbackPath = Path.Combine(applicationDirectory, $"fallback_names.{locale}.json");
+
try
{
using (var request = new HttpRequestMessage()
@@ -440,30 +444,33 @@ private async Task LoadMarketItem(string url)
var response = await client.SendAsync(request).ConfigureAwait(false);
var body = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
var data = JsonConvert.DeserializeObject(body);
- if (wfmItemsFallbackPaths.TryGetValue(locale, out var fallbackPath))
+
+ // Validate payload structure before caching
+ if (data != null && data["data"] != null && data["data"] is JArray)
{
- File.WriteAllText(fallbackPath, body);
+ File.WriteAllText(localeSpecificFallbackPath, body);
+ return (data, false);
+ }
+ else
+ {
+ Main.AddLog($"Invalid payload structure received from {wfmItemsUrl}, using fallback file {localeSpecificFallbackPath}");
+ throw new InvalidDataException($"Invalid JSON payload structure from {wfmItemsUrl}");
}
- return (data, false);
}
}
catch (Exception ex)
{
- if (wfmItemsFallbackPaths.TryGetValue(locale, out var fallbackPath))
+ Main.AddLog("Failed to fetch/parse " + wfmItemsUrl + ", using file " + localeSpecificFallbackPath + Environment.NewLine + ex.ToString());
+ if (File.Exists(localeSpecificFallbackPath))
{
- Main.AddLog("Failed to fetch/parse " + wfmItemsUrl + ", using file " + fallbackPath + Environment.NewLine + ex.ToString());
- if (File.Exists(fallbackPath))
- {
- string response = File.ReadAllText(fallbackPath);
- JObject data = JsonConvert.DeserializeObject(response);
- return (data, true);
- }
+ string response = File.ReadAllText(localeSpecificFallbackPath);
+ JObject data = JsonConvert.DeserializeObject(response);
+ return (data, true);
}
else
{
- Main.AddLog("Failed to fetch/parse " + wfmItemsUrl + ", and no fallback path found for locale: " + locale + Environment.NewLine + ex.ToString());
+ throw new AggregateException("No local fallback found", ex);
}
- throw new AggregateException("No local fallback found", ex);
}
}
@@ -576,7 +583,16 @@ private JObject ParseFileOrMakeNew(string path, ref bool parseHasFailed)
{
if (File.Exists(path))
{
- return JsonConvert.DeserializeObject(File.ReadAllText(path));
+ try
+ {
+ return JsonConvert.DeserializeObject(File.ReadAllText(path));
+ }
+ catch (Exception ex)
+ {
+ Main.AddLog($"Failed to parse {path}: {ex.Message}");
+ parseHasFailed = true;
+ return null;
+ }
}
Main.AddLog(path + " missing, loading blank");
parseHasFailed = true;
@@ -595,25 +611,45 @@ public async Task UpdateInner(bool force)
if (marketData == null)
{
marketData = ParseFileOrMakeNew(marketDataPath, ref parseHasFailed);
+ if (marketData == null)
+ {
+ throw new InvalidDataException($"Failed to parse marketData from '{marketDataPath}'. JSON deserialization returned null.");
+ }
}
lock (marketItemsLock)
{
if (marketItems == null)
{
marketItems = ParseFileOrMakeNew(marketItemsPath, ref parseHasFailed);
+ if (marketItems == null)
+ {
+ throw new InvalidDataException($"Failed to parse marketItems from '{marketItemsPath}'. JSON deserialization returned null.");
+ }
}
}
if (equipmentData == null)
{
equipmentData = ParseFileOrMakeNew(equipmentDataPath, ref parseHasFailed);
+ if (equipmentData == null)
+ {
+ throw new InvalidDataException($"Failed to parse equipmentData from '{equipmentDataPath}'. JSON deserialization returned null.");
+ }
}
if (relicData == null)
{
relicData = ParseFileOrMakeNew(relicDataPath, ref parseHasFailed);
+ if (relicData == null)
+ {
+ throw new InvalidDataException($"Failed to parse relicData from '{relicDataPath}'. JSON deserialization returned null.");
+ }
}
if (nameData == null)
{
nameData = ParseFileOrMakeNew(nameDataPath, ref parseHasFailed);
+ if (nameData == null)
+ {
+ throw new InvalidDataException($"Failed to parse nameData from '{nameDataPath}'. JSON deserialization returned null.");
+ }
}
string oldMarketTimeText;
@@ -829,185 +865,44 @@ public int GetDifference(char c1, char c2)
public int LevenshteinDistance(string s, string t)
{
- switch (_settings.Locale)
- {
- case "ko":
- // for korean
- return LevenshteinDistanceKorean(s, t);
- default:
- return LevenshteinDistanceDefault(s, t);
- }
- }
-
- public static int LevenshteinDistanceDefault(string s, string t)
- {
- // Levenshtein Distance determines how many character changes it takes to form a known result
- // For example: Nuvo Prime is closer to Nova Prime (2) then Ash Prime (4)
- // For more info see: https://en.wikipedia.org/wiki/Levenshtein_distance
- s = s.ToLower(Main.culture);
- t = t.ToLower(Main.culture);
- int n = s.Length;
- int m = t.Length;
- int[,] d = new int[n + 1, m + 1];
-
- if (n == 0 || m == 0)
- return n + m;
-
- d[0, 0] = 0;
-
- int count = 0;
- for (int i = 1; i <= n; i++)
- d[i, 0] = (s[i - 1] == ' ' ? count : ++count);
-
- count = 0;
- for (int j = 1; j <= m; j++)
- d[0, j] = (t[j - 1] == ' ' ? count : ++count);
-
- for (int i = 1; i <= n; i++)
- for (int j = 1; j <= m; j++)
- {
- // deletion of s
- int opt1 = d[i - 1, j];
- if (s[i - 1] != ' ')
- opt1++;
-
- // deletion of t
- int opt2 = d[i, j - 1];
- if (t[j - 1] != ' ')
- opt2++;
-
- // swapping s to t
- int opt3 = d[i - 1, j - 1];
- if (t[j - 1] != s[i - 1])
- opt3++;
- d[i, j] = Math.Min(Math.Min(opt1, opt2), opt3);
- }
-
-
-
- return d[n, m];
- }
-
- // This isn't used anymore?!
- public static bool IsKorean(String str)
- {
- // Safeguard for empty strings that will give false positives and/or crashes
- if (string.IsNullOrEmpty(str)) return false;
- char c = str[0];
- if (0x1100 <= c && c <= 0x11FF) return true;
- if (0x3130 <= c && c <= 0x318F) return true;
- if (0xAC00 <= c && c <= 0xD7A3) return true;
- return false;
+ var processor = LanguageProcessorFactory.GetCurrentProcessor();
+ return processor.CalculateLevenshteinDistance(s, t);
}
public string GetLocaleNameData(string s)
{
- string localeName = "";
-
- lock (marketItemsLock)
- {
- if (marketItems != null) // Add null check
- {
- foreach (var marketItem in marketItems)
- {
- if (marketItem.Key == "version")
- continue;
- string[] split = marketItem.Value.ToString().Split('|');
- if (split[0] == s)
- {
- localeName = split.Length > 2 ? split[2] : "";
- break;
- }
- }
- }
- }
-
- return localeName;
+ return GetLocaleNameData(s, true);
}
- private protected static string e = "A?s/,;j_> group, int ak, int bk)
+ /// <summary>
+ /// Resolves OCR-specific ambiguities between similar-looking operator names
+ /// </summary>
+ /// <param name="currentBest">Current best match</param>
+ /// <param name="candidate">Candidate alternative</param>
+ /// <param name="ocrText">Original OCR text for disambiguation</param>
+ /// <returns>True if the candidate should be preferred over current</returns>
+ private bool ResolveOcrAmbiguity(string currentBest, string candidate, string ocrText)
 {
- foreach (var entry in group)
- {
- if (entry.Value.Contains(ak) && entry.Value.Contains(bk))
- {
- return true;
- }
- }
+ // A missing name can never win the tie; the prefix checks below are ordinal so they do not vary with the current culture
+ if (currentBest == null || candidate == null)
+ return false;
+ bool currentIsGara = currentBest.StartsWith("Gara", StringComparison.Ordinal);
+ // Handle Gara/Ivara OCR confusion - these operators have similar visual patterns
+ if (currentIsGara && candidate.StartsWith("Ivara", StringComparison.Ordinal))
+ return true;
+ // Handle Gara/Mesa OCR confusion - garbled "Mesa" (e.g. "Mggga") can tie with "Gara"; M and G are visually distinct, so the first OCR character decides
+ if (currentIsGara && candidate.StartsWith("Mesa", StringComparison.Ordinal) && !string.IsNullOrEmpty(ocrText) && ocrText.StartsWith("M", StringComparison.OrdinalIgnoreCase))
+ return true;
+ // Future OCR ambiguities can be added here
 return false;
 }
@@ -1095,30 +990,115 @@ public string GetPartName(string name, out int low, bool suppressLogging, out bo
string lowest_unfiltered = null;
low = 9999;
multipleLowest = false;
- foreach (KeyValuePair prop in nameData)
- {
- int val = LevenshteinDistance(prop.Key, name);
- if (val < low)
- {
- low = val;
- lowest = prop.Value.ToObject();
- lowest_unfiltered = prop.Key;
- multipleLowest = false;
+
+ // For all non-English supported languages - check against localized names directly to avoid expensive conversion
+ if (_settings.Locale != "en")
+ {
+ // Check against localized names in marketItems
+ List> marketItemsSnapshot;
+ var processor = LanguageProcessorFactory.GetCurrentProcessor();
+ string normalizedName = processor.NormalizeForPatternMatching(name);
+
+ // Snapshot minimal data needed under lock
+ lock (marketItemsLock)
+ {
+ if (marketItems != null)
+ {
+ // Check if cached locale matches current locale
+ string cachedLocale = marketItems.TryGetValue("locale", out var localeToken) ? localeToken?.ToString() : null;
+ bool useLocalizedNames = cachedLocale == _settings.Locale;
+
+ marketItemsSnapshot = new List>();
+
+ foreach (var marketItem in marketItems)
+ {
+ if (marketItem.Key == "version") continue;
+ string[] split = marketItem.Value.ToString().Split('|');
+ if (split.Length < 2) continue;
+
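+ // Length pre-filter: compare the length of the name that will be matched (localized when the cache locale matches, otherwise English) against the OCR text; the allowed difference is half of the longer of the English name and the OCR text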
+ int englishNameLength = split[0].Length;
+ int lengthDiff = Math.Abs((useLocalizedNames && split.Length >= 3 ? split[2].Length : split[0].Length) - name.Length);
+ if (lengthDiff > Math.Max(englishNameLength, name.Length) / 2) continue;
+
+ // Use localized name only if cache locale matches and available, otherwise fall back to English
+ string comparisonName = useLocalizedNames && split.Length >= 3 ? split[2] : split[0];
+ marketItemsSnapshot.Add(Tuple.Create(split[0], comparisonName, processor.NormalizeForPatternMatching(comparisonName)));
+ }
+ }
+ else
+ {
+ marketItemsSnapshot = new List>();
+ }
}
- else if (val == low)
- {
- multipleLowest = true;
+
+ // Do heavy Levenshtein work outside lock
+ foreach (var item in marketItemsSnapshot)
+ {
+ string englishName = item.Item1;
+ string storedName = item.Item2;
+ string normalizedStored = item.Item3;
+
+ int val = processor.CalculateLevenshteinDistance(normalizedName, normalizedStored);
+
+ // Distance filter: Only accept matches with distance < 50% of string length (like GetLocalizedNameData)
+ if (val >= storedName.Length * 0.5) continue;
+
+ if (val < low)
+ {
+ low = val;
+ lowest = englishName; // Return English name
+ lowest_unfiltered = storedName; // Show localized name in log
+ multipleLowest = false;
+ }
+ else if (val == low)
+ {
+ multipleLowest = true;
+ }
}
-
- if (val == low && lowest.StartsWith("Gara") && prop.Key.StartsWith("Ivara")) //If both
+ }
+ else
+ {
+ // Original logic for English
+ // For English, resolvedName is just the original OCR text
+ string resolvedName = name;
+
+ foreach (KeyValuePair prop in nameData)
{
- lowest = prop.Value.ToObject();
- lowest_unfiltered = prop.Key;
+ int lengthDiff = Math.Abs(prop.Key.Length - name.Length);
+ if (lengthDiff > Math.Max(prop.Key.Length, name.Length) / 2) continue; // Skip if too different in length
+
+ // Resolve OCR text to English for proper comparison (without recursive Levenshtein calls)
+ int val = LevenshteinDistance(prop.Key, resolvedName);
+
+ // Distance filter: Only accept matches with distance < 50% of string length
+ if (val >= prop.Key.Length * 0.5) continue;
+
+ if (val < low)
+ {
+ low = val;
+ lowest = prop.Value.ToObject();
+ lowest_unfiltered = prop.Key;
+ multipleLowest = false;
+ }
+ else if (val == low)
+ {
+ multipleLowest = true;
+ }
+
+ // Handle OCR ambiguity between Gara and Ivara operators
+ // These operators have similar visual patterns that can confuse OCR
+ if (val == low && ResolveOcrAmbiguity(lowest, prop.Key, resolvedName))
+ {
+ lowest = prop.Value.ToObject();
+ lowest_unfiltered = prop.Key;
+ }
}
}
if (!suppressLogging)
Main.AddLog("Found part(" + low + "): \"" + lowest_unfiltered + "\" from \"" + name + "\"");
+
return lowest;
}
@@ -1127,11 +1107,24 @@ public string GetPartNameHuman(string name, out int low)
string lowest = null;
string lowest_unfiltered = null;
low = 9999;
+
+ // Resolve OCR text to English once before loops to avoid repeated expensive database searches
+ // Only resolve for non-English locales to avoid regression in English
+ string resolvedName;
+ if (_settings.Locale == "en")
+ {
+ resolvedName = name; // Use original OCR text for English
+ }
+ else
+ {
+ resolvedName = GetLocaleNameData(name, false) ?? name; // Fallback to original OCR string if resolution fails
+ }
+
foreach (KeyValuePair prop in nameData)
{
if (prop.Value.ToString().ToLower(Main.culture).Contains(name.ToLower(Main.culture)))
{
- int val = LevenshteinDistance(prop.Value.ToString(), name);
+ int val = LevenshteinDistance(prop.Value.ToString(), resolvedName);
if (val < low)
{
low = val;
@@ -1144,7 +1137,7 @@ public string GetPartNameHuman(string name, out int low)
{
foreach (KeyValuePair prop in nameData)
{
- int val = LevenshteinDistance(prop.Value.ToString(), name);
+ int val = LevenshteinDistance(prop.Value.ToString(), resolvedName);
if (val < low)
{
low = val;
@@ -1192,7 +1185,7 @@ public static string GetSetName(string name)
result = result.Replace("hilt", "");
result = result.Replace("link", "");
result = result.TrimEnd();
- result = Main.culture.TextInfo.ToTitleCase(result);
+ result = LanguageProcessorFactory.GetCurrentProcessor().Culture.TextInfo.ToTitleCase(result);
result += " Set";
return result;
}
@@ -1460,7 +1453,7 @@ public static void SetUserAgent(ClientWebSocketOptions options, string userAgent
options.SetRequestHeader("User-Agent", userAgent);
return;
}
- catch (System.ArgumentException ex)
+ catch (System.ArgumentException)
{
//Debug.WriteLine(ex.ToString());
// Fallback to reflection if User-Agent is not settable
diff --git a/WFInfo/LanguageProcessing/ChineseLanguageProcessor.cs b/WFInfo/LanguageProcessing/ChineseLanguageProcessor.cs
new file mode 100644
index 00000000..51eb169c
--- /dev/null
+++ b/WFInfo/LanguageProcessing/ChineseLanguageProcessor.cs
@@ -0,0 +1,146 @@
+using System;
+using System.Text.RegularExpressions;
+using WFInfo.Settings;
+
+namespace WFInfo.LanguageProcessing
+{
+    /// <summary>
+    /// Base class for Chinese language processors containing shared behaviors
+    /// </summary>
+    public abstract class ChineseLanguageProcessorBase : LanguageProcessor
+    {
+        // The whitelist requests ranges spanning more than 20,000 code points, so it is built on first access
+        // and then reused (assumes the range helpers return the same text for the same bounds - TODO confirm).
+        private string _characterWhitelist;
+
+        protected ChineseLanguageProcessorBase(IReadOnlyApplicationSettings settings) : base(settings)
+        {
+        }
+
+        /// <summary>CJK ideograph ranges plus Latin letters and the space character.</summary>
+        public override string CharacterWhitelist =>
+            _characterWhitelist ?? (_characterWhitelist =
+                string.Concat(GenerateCharacterRangeIterator(0x4E00, 0x7FFF)) +
+                string.Concat(GenerateCharacterRangeIterator(0x8000, 0x9FFF)) +
+                GenerateCharacterRange(0x3400, 0x4DBF) +
+                GenerateCharacterRange(0xF900, 0xFAFF) +
+                "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz "); // Full CJK ideographs
+
+        /// <summary>Lower-cases and trims the input, isolates "prime", applies RemoveAccents and collapses runs of spaces.</summary>
+        public override string NormalizeForPatternMatching(string input)
+        {
+            if (string.IsNullOrEmpty(input)) return input;
+
+            // Basic cleanup for Chinese
+            string normalized = input.ToLower(_culture).Trim();
+
+            // Add spaces around "Prime" to match database format better
+            normalized = normalized.Replace("prime", " prime ");
+
+            // Remove accents (not typically needed for Chinese)
+            normalized = RemoveAccents(normalized);
+
+            // Remove extra spaces
+            var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+            return string.Join(" ", parts);
+        }
+
+        public override bool IsPartNameValid(string partName)
+        {
+            // Chinese requires minimum of 4 characters after removing spaces
+            return !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 4;
+        }
+
+        public override bool ShouldFilterWord(string word)
+        {
+            return FilterWordCore(word);
+        }
+
+        /// <summary>
+        /// Shared filtering logic for Chinese word processing
+        /// </summary>
+        /// <returns>True when the word should be discarded; null and empty words are always discarded</returns>
+        public static bool FilterWordCore(string word)
+        {
+            if (string.IsNullOrEmpty(word)) return true;
+
+            bool hasLatin = false;
+            foreach (char c in word)
+            {
+                if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+                {
+                    hasLatin = true;
+                    break;
+                }
+            }
+
+            // Pure CJK words are kept (even single chars are meaningful in Chinese), as is anything else without Latin letters.
+            // Words containing Latin letters, alone or mixed with CJK (e.g. "G壬"), are OCR noise when <= 2 chars:
+            // the shortest valid Latin item name component is 3 chars (Ash, Nyx, Mag).
+            return hasLatin && word.Length <= 2;
+        }
+
+        /// <summary>
+        /// Checks if a string contains CJK characters; null or empty text contains none
+        /// </summary>
+        public static bool ContainsCJK(string text)
+        {
+            if (string.IsNullOrEmpty(text)) return false;
+
+            foreach (char c in text)
+            {
+                if ((c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x3400 && c <= 0x4DBF) || (c >= 0xF900 && c <= 0xFAFF))
+                    return true;
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// Normalizes Chinese characters for comparison
+        /// </summary>
+        protected static string NormalizeChineseCharacters(string input)
+        {
+            return NormalizeFullWidthCharacters(input).ToLowerInvariant();
+        }
+    }
+
+    /// <summary>
+    /// Simplified Chinese language processor for OCR text processing
+    /// Handles Simplified Chinese characters
+    /// </summary>
+    public class SimplifiedChineseLanguageProcessor : ChineseLanguageProcessorBase
+    {
+        // One shared array: BlueprintRemovals is read on every distance calculation, so avoid allocating per call (treat as read-only)
+        private static readonly string[] _blueprintRemovals = { "蓝图", "设计图" };
+
+        public SimplifiedChineseLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings)
+        {
+        }
+
+        public override string Locale => "zh-hans";
+        public override string[] BlueprintRemovals => _blueprintRemovals;
+
+        public override int CalculateLevenshteinDistance(string s, string t) =>
+            LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizeChineseCharacters, callBaseDefault: true);
+    }
+
+    /// <summary>
+    /// Traditional Chinese language processor for OCR text processing
+    /// Handles Traditional Chinese characters
+    /// </summary>
+    public class TraditionalChineseLanguageProcessor : ChineseLanguageProcessorBase
+    {
+        // One shared array: BlueprintRemovals is read on every distance calculation, so avoid allocating per call (treat as read-only)
+        private static readonly string[] _blueprintRemovals = { "藍圖", "設計圖" };
+
+        public TraditionalChineseLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings)
+        {
+        }
+
+        public override string Locale => "zh-hant";
+        public override string[] BlueprintRemovals => _blueprintRemovals;
+
+        public override int CalculateLevenshteinDistance(string s, string t) =>
+            LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizeChineseCharacters, callBaseDefault: true);
+    }
+}
diff --git a/WFInfo/LanguageProcessing/CyrillicLanguageProcessor.cs b/WFInfo/LanguageProcessing/CyrillicLanguageProcessor.cs
new file mode 100644
index 00000000..72725be4
--- /dev/null
+++ b/WFInfo/LanguageProcessing/CyrillicLanguageProcessor.cs
@@ -0,0 +1,111 @@
+using System;
+using System.Text.RegularExpressions;
+using WFInfo.Settings;
+
+namespace WFInfo.LanguageProcessing
+{
+    /// <summary>
+    /// Russian language processor for OCR text processing
+    /// Compares Russian Cyrillic text directly against Russian names (no Latin transliteration)
+    /// </summary>
+    public class RussianLanguageProcessor : LanguageProcessor
+    {
+        // No blueprint removals - handled in NormalizeForPatternMatching. One shared array avoids an allocation per property read.
+        private static readonly string[] _noBlueprintRemovals = new string[0];
+        // Leading "чертёж:" / "чертеж:" marker together with any whitespace that follows it
+        private static readonly Regex _blueprintPrefix = new Regex(@"^черт[её]ж:\s*");
+
+        public RussianLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings)
+        {
+        }
+
+        public override string Locale => "ru";
+
+        public override string[] BlueprintRemovals => _noBlueprintRemovals;
+
+        public override string CharacterWhitelist => GenerateCharacterRange(0x0400, 0x04FF) + GenerateCharacterRange(0x0500, 0x052F) + ": "; // Cyrillic + Cyrillic Supplement
+
+        // For Russian, don't normalize Cyrillic to Latin - we want to match Russian to Russian
+        public override int CalculateLevenshteinDistance(string s, string t) =>
+            LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, null);
+
+        public override string NormalizeForPatternMatching(string input)
+        {
+            if (string.IsNullOrEmpty(input)) return input;
+
+            // Basic cleanup for Russian
+            string normalized = input.ToLower(_culture).Trim();
+
+            // Handle Russian blueprint format: "Чертёж: <item>" -> "<item> (чертеж)"
+            // A single ordinal regex both detects and strips the prefix, so detection and removal can never disagree
+            Match prefix = _blueprintPrefix.Match(normalized);
+            if (prefix.Success)
+            {
+                normalized = normalized.Substring(prefix.Length) + " (чертеж)";
+            }
+
+            // Remove extra spaces
+            var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+            return string.Join(" ", parts);
+        }
+
+        // Russian requires minimum of 6 characters after removing spaces
+        public override bool IsPartNameValid(string partName) =>
+            !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 6;
+
+        // Russian filters very short words (less than 2 characters);
+        // null and empty words are reported as not filtered
+        public override bool ShouldFilterWord(string word) =>
+            !string.IsNullOrEmpty(word) && word.Length < 2;
+    }
+
+    /// <summary>
+    /// Ukrainian language processor for OCR text processing
+    /// Compares Ukrainian Cyrillic text directly against Ukrainian names (no Latin transliteration)
+    /// </summary>
+    public class UkrainianLanguageProcessor : LanguageProcessor
+    {
+        // One shared array: BlueprintRemovals is read on every distance calculation, so avoid allocating per call (treat as read-only)
+        // NOTE(review): the term is capitalized while NormalizeForPatternMatching lower-cases its input - confirm the removal step is case-insensitive
+        private static readonly string[] _blueprintRemovals = { "Кресленник" };
+
+        public UkrainianLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings)
+        {
+        }
+
+        public override string Locale => "uk";
+
+        public override string[] BlueprintRemovals => _blueprintRemovals;
+
+        public override string CharacterWhitelist => GenerateCharacterRange(0x0400, 0x04FF) + GenerateCharacterRange(0x0500, 0x052F) + ": -()"; // Cyrillic + Cyrillic Supplement
+
+        public override int CalculateLevenshteinDistance(string s, string t)
+        {
+            // For Ukrainian, don't normalize Cyrillic to Latin - we want to match Ukrainian to Ukrainian
+            return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, null);
+        }
+
+        public override string NormalizeForPatternMatching(string input)
+        {
+            if (string.IsNullOrEmpty(input)) return input;
+
+            // Basic cleanup for Ukrainian
+            string normalized = input.ToLower(_culture).Trim();
+
+            // RemoveAccents is not applied to Ukrainian text
+
+            // Remove extra spaces
+            var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+            return string.Join(" ", parts);
+        }
+
+        // Ukrainian requires minimum of 6 characters after removing spaces
+        public override bool IsPartNameValid(string partName) =>
+            !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 6;
+
+        // Ukrainian filters very short words (less than 2 characters);
+        // null and empty words are reported as not filtered
+        public override bool ShouldFilterWord(string word) =>
+            !string.IsNullOrEmpty(word) && word.Length < 2;
+    }
+}
diff --git a/WFInfo/LanguageProcessing/EnglishLanguageProcessor.cs b/WFInfo/LanguageProcessing/EnglishLanguageProcessor.cs
new file mode 100644
index 00000000..abd3f07a
--- /dev/null
+++ b/WFInfo/LanguageProcessing/EnglishLanguageProcessor.cs
@@ -0,0 +1,55 @@
+using System;
+using System.Text.RegularExpressions;
+using WFInfo.Settings;
+
+namespace WFInfo.LanguageProcessing
+{
+    /// <summary>
+    /// OCR text processor for the English ("en") locale.
+    /// Relies on the default distance calculation and a plain ASCII-letter whitelist.
+    /// </summary>
+    public class EnglishLanguageProcessor : LanguageProcessor
+    {
+        public EnglishLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings)
+        {
+        }
+
+        public override string Locale => "en";
+
+        public override string[] BlueprintRemovals => new[] { "Blueprint" };
+
+        // ASCII letters only; the literal contains no space or punctuation.
+        public override string CharacterWhitelist => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+        // No language-specific preprocessing: defer to the shared default implementation.
+        public override int CalculateLevenshteinDistance(string s, string t) => DefaultLevenshteinDistance(s, t);
+
+        public override string NormalizeForPatternMatching(string input)
+        {
+            if (string.IsNullOrEmpty(input))
+            {
+                return input;
+            }
+
+            // Lower-case and trim, then pad "prime" with a space on each side so it becomes
+            // its own word; the split/join below squeezes the surplus spaces out again.
+            string spaced = input.ToLower(_culture).Trim().Replace("prime", " prime ");
+            string[] words = spaced.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+
+            return string.Join(" ", words);
+        }
+
+        public override bool IsPartNameValid(string partName)
+        {
+            // Null is rejected, and so is anything shorter than 13 characters (spaces included).
+            return partName != null && partName.Length >= 13;
+        }
+
+        public override bool ShouldFilterWord(string word)
+        {
+            // Only one-character words are filtered; null and empty words are not.
+            if (string.IsNullOrEmpty(word)) return false;
+            return word.Length < 2;
+        }
+    }
+}
diff --git a/WFInfo/LanguageProcessing/EuropeanLanguageProcessor.cs b/WFInfo/LanguageProcessing/EuropeanLanguageProcessor.cs
new file mode 100644
index 00000000..517bbdd8
--- /dev/null
+++ b/WFInfo/LanguageProcessing/EuropeanLanguageProcessor.cs
@@ -0,0 +1,244 @@
+using System;
+using System.Text.RegularExpressions;
+using WFInfo.Settings;
+
+namespace WFInfo.LanguageProcessing
+{
+    /// <summary>
+    /// Base class for European language processors with common diacritic handling
+    /// </summary>
+    public abstract class EuropeanLanguageProcessorBase : LanguageProcessor
+    {
+        protected EuropeanLanguageProcessorBase(IReadOnlyApplicationSettings settings) : base(settings)
+        {
+        }
+
+        /// <summary>
+        /// Lower-cases and trims <paramref name="input"/>, isolates "prime" as its own word and
+        /// collapses runs of spaces. Accents are kept. Null or empty input is returned as-is.
+        /// </summary>
+        public override string NormalizeForPatternMatching(string input)
+        {
+            if (string.IsNullOrEmpty(input)) return input;
+
+            string normalized = input.ToLower(_culture).Trim().Replace("prime", " prime ");
+
+            // Split/join squeezes out the surplus spaces introduced around "prime".
+            var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+            return string.Join(" ", parts);
+        }
+
+        /// <summary>Accepts names of at least 8 characters (spaces included).</summary>
+        public override bool IsPartNameValid(string partName) => !string.IsNullOrEmpty(partName) && partName.Length >= 8;
+
+        /// <summary>Filters one-character words; null and empty words are not filtered.</summary>
+        public override bool ShouldFilterWord(string word) => !string.IsNullOrEmpty(word) && word.Length < 2;
+
+        /// <summary>Delegates to <see cref="DefaultLevenshteinDistance"/>, which is overridden below.</summary>
+        public override int CalculateLevenshteinDistance(string s, string t) => DefaultLevenshteinDistance(s, t);
+
+        // NOTE(review): callBaseDefault presumably stops the helper from re-entering this
+        // override - confirm against LanguageProcessor.LevenshteinDistanceWithPreprocessing.
+        protected override int DefaultLevenshteinDistance(string s, string t) =>
+            LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizeEuropeanCharacters, callBaseDefault: true);
+
+        /// <summary>
+        /// Lower-cases <paramref name="input"/> (invariant culture) and maps the accented vowels,
+        /// ñ, ç and ÿ to their unaccented ASCII letters. Null or empty input is returned unchanged.
+        /// </summary>
+        protected static string NormalizeEuropeanCharacters(string input)
+        {
+            if (string.IsNullOrEmpty(input)) return input;
+
+            // Upper-case forms need no entries: the text is lower-cased before it is mapped.
+            char[] chars = input.ToLowerInvariant().ToCharArray();
+            for (int i = 0; i < chars.Length; i++)
+            {
+                switch (chars[i])
+                {
+                    case 'à': case 'á': case 'â': case 'ã': case 'ä': case 'å': chars[i] = 'a'; break;
+                    case 'è': case 'é': case 'ê': case 'ë': chars[i] = 'e'; break;
+                    case 'ì': case 'í': case 'î': case 'ï': chars[i] = 'i'; break;
+                    case 'ò': case 'ó': case 'ô': case 'õ': case 'ö': chars[i] = 'o'; break;
+                    case 'ù': case 'ú': case 'û': case 'ü': chars[i] = 'u'; break;
+                    case 'ñ': chars[i] = 'n'; break;
+                    case 'ç': chars[i] = 'c'; break;
+                    case 'ÿ': chars[i] = 'y'; break;
+                }
+            }
+
+            return new string(chars);
+        }
+    }
+
+    /// <summary>
+    /// OCR text processor for the German ("de") locale.
+    /// Inherits normalization and distance handling from EuropeanLanguageProcessorBase.
+    /// </summary>
+    public class GermanLanguageProcessor : EuropeanLanguageProcessorBase
+    {
+        public GermanLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) { }
+
+        public override string Locale => "de";
+
+        public override string[] BlueprintRemovals => new[] { "Blaupause", "Plan" };
+
+        public override string CharacterWhitelist => string.Concat("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ",
+            GenerateCharacterRange(0x00C4, 0x00C4), GenerateCharacterRange(0x00D6, 0x00D6), GenerateCharacterRange(0x00DC, 0x00DC), GenerateCharacterRange(0x00DF, 0x00DF), // U+00C4 Ä, U+00D6 Ö, U+00DC Ü, U+00DF ß
+            GenerateCharacterRange(0x00E4, 0x00E4), GenerateCharacterRange(0x00F6, 0x00F6), GenerateCharacterRange(0x00FC, 0x00FC)); // U+00E4 ä, U+00F6 ö, U+00FC ü
+    }
+
+    /// <summary>
+    /// OCR text processor for the Spanish ("es") locale.
+    /// Inherits normalization and distance handling from EuropeanLanguageProcessorBase.
+    /// </summary>
+    public class SpanishLanguageProcessor : EuropeanLanguageProcessorBase
+    {
+        public SpanishLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) { }
+
+        public override string Locale => "es";
+
+        public override string[] BlueprintRemovals => new[] { "Plano", "Diseño" };
+
+        // ASCII letters and space first, then one single-code-point range per accented letter.
+        public override string CharacterWhitelist => string.Concat(
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ",
+            GenerateCharacterRange(0x00C1, 0x00C1), // U+00C1 Á
+            GenerateCharacterRange(0x00C9, 0x00C9), // U+00C9 É
+            GenerateCharacterRange(0x00CD, 0x00CD), // U+00CD Í
+            GenerateCharacterRange(0x00D1, 0x00D1), // U+00D1 Ñ
+            GenerateCharacterRange(0x00D3, 0x00D3), // U+00D3 Ó
+            GenerateCharacterRange(0x00DA, 0x00DA), // U+00DA Ú
+            GenerateCharacterRange(0x00DC, 0x00DC), // U+00DC Ü
+            GenerateCharacterRange(0x00E1, 0x00E1), // U+00E1 á
+            GenerateCharacterRange(0x00E9, 0x00E9), // U+00E9 é
+            GenerateCharacterRange(0x00ED, 0x00ED), // U+00ED í
+            GenerateCharacterRange(0x00F1, 0x00F1), // U+00F1 ñ
+            GenerateCharacterRange(0x00F3, 0x00F3), // U+00F3 ó
+            GenerateCharacterRange(0x00FA, 0x00FA), // U+00FA ú
+            GenerateCharacterRange(0x00FC, 0x00FC)); // U+00FC ü
+    }
+
+    /// <summary>
+    /// OCR text processor for the Portuguese ("pt") locale.
+    /// Inherits normalization and distance handling from EuropeanLanguageProcessorBase.
+    /// </summary>
+    public class PortugueseLanguageProcessor : EuropeanLanguageProcessorBase
+    {
+        public PortugueseLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) { }
+
+        public override string Locale => "pt";
+
+        public override string[] BlueprintRemovals => new[] { "Planta", "Projeto" };
+
+        // ASCII letters and space first, then one single-code-point range per accented letter.
+        public override string CharacterWhitelist => string.Concat(
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ",
+            GenerateCharacterRange(0x00C0, 0x00C0), // U+00C0 À
+            GenerateCharacterRange(0x00C1, 0x00C1), // U+00C1 Á
+            GenerateCharacterRange(0x00C2, 0x00C2), // U+00C2 Â
+            GenerateCharacterRange(0x00C3, 0x00C3), // U+00C3 Ã
+            GenerateCharacterRange(0x00C7, 0x00C7), // U+00C7 Ç
+            GenerateCharacterRange(0x00C9, 0x00C9), // U+00C9 É
+            GenerateCharacterRange(0x00CA, 0x00CA), // U+00CA Ê
+            GenerateCharacterRange(0x00CD, 0x00CD), // U+00CD Í
+            GenerateCharacterRange(0x00D3, 0x00D3), // U+00D3 Ó
+            GenerateCharacterRange(0x00D4, 0x00D4), // U+00D4 Ô
+            GenerateCharacterRange(0x00D5, 0x00D5), // U+00D5 Õ
+            GenerateCharacterRange(0x00DA, 0x00DA), // U+00DA Ú
+            GenerateCharacterRange(0x00DC, 0x00DC), // U+00DC Ü
+            GenerateCharacterRange(0x00E0, 0x00E0), // U+00E0 à
+            GenerateCharacterRange(0x00E1, 0x00E1), // U+00E1 á
+            GenerateCharacterRange(0x00E2, 0x00E2), // U+00E2 â
+            GenerateCharacterRange(0x00E3, 0x00E3), // U+00E3 ã
+            GenerateCharacterRange(0x00E7, 0x00E7), // U+00E7 ç
+            GenerateCharacterRange(0x00E9, 0x00E9), // U+00E9 é
+            GenerateCharacterRange(0x00EA, 0x00EA), // U+00EA ê
+            GenerateCharacterRange(0x00ED, 0x00ED), // U+00ED í
+            GenerateCharacterRange(0x00F3, 0x00F3), // U+00F3 ó
+            GenerateCharacterRange(0x00F4, 0x00F4), // U+00F4 ô
+            GenerateCharacterRange(0x00F5, 0x00F5), // U+00F5 õ
+            GenerateCharacterRange(0x00FA, 0x00FA), // U+00FA ú
+            GenerateCharacterRange(0x00FC, 0x00FC)); // U+00FC ü
+    }
+
+    /// <summary>
+    /// OCR text processor for the French ("fr") locale.
+    /// Inherits normalization and distance handling from EuropeanLanguageProcessorBase.
+    /// </summary>
+    public class FrenchLanguageProcessor : EuropeanLanguageProcessorBase
+    {
+        public FrenchLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) { }
+
+        public override string Locale => "fr";
+
+        public override string[] BlueprintRemovals => new[] { "Schéma", "Plan" };
+
+        // ASCII letters and space first, then one single-code-point range per accented letter.
+        public override string CharacterWhitelist => string.Concat(
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ",
+            GenerateCharacterRange(0x00C0, 0x00C0), // U+00C0 À
+            GenerateCharacterRange(0x00C2, 0x00C2), // U+00C2 Â
+            GenerateCharacterRange(0x00C6, 0x00C6), // U+00C6 Æ
+            GenerateCharacterRange(0x00C7, 0x00C7), // U+00C7 Ç
+            GenerateCharacterRange(0x00C8, 0x00C8), // U+00C8 È
+            GenerateCharacterRange(0x00C9, 0x00C9), // U+00C9 É
+            GenerateCharacterRange(0x00CA, 0x00CA), // U+00CA Ê
+            GenerateCharacterRange(0x00CB, 0x00CB), // U+00CB Ë
+            GenerateCharacterRange(0x00CE, 0x00CE), // U+00CE Î
+            GenerateCharacterRange(0x00CF, 0x00CF), // U+00CF Ï
+            GenerateCharacterRange(0x00D4, 0x00D4), // U+00D4 Ô
+            GenerateCharacterRange(0x00D6, 0x00D6), // U+00D6 Ö
+            GenerateCharacterRange(0x00D9, 0x00D9), // U+00D9 Ù
+            GenerateCharacterRange(0x00DB, 0x00DB), // U+00DB Û
+            GenerateCharacterRange(0x00DC, 0x00DC), // U+00DC Ü
+            GenerateCharacterRange(0x00E0, 0x00E0), // U+00E0 à
+            GenerateCharacterRange(0x00E2, 0x00E2), // U+00E2 â
+            GenerateCharacterRange(0x00E6, 0x00E6), // U+00E6 æ
+            GenerateCharacterRange(0x00E7, 0x00E7), // U+00E7 ç
+            GenerateCharacterRange(0x00E8, 0x00E8), // U+00E8 è
+            GenerateCharacterRange(0x00E9, 0x00E9), // U+00E9 é
+            GenerateCharacterRange(0x00EA, 0x00EA), // U+00EA ê
+            GenerateCharacterRange(0x00EB, 0x00EB), // U+00EB ë
+            GenerateCharacterRange(0x00EE, 0x00EE), // U+00EE î
+            GenerateCharacterRange(0x00EF, 0x00EF), // U+00EF ï
+            GenerateCharacterRange(0x00F4, 0x00F4), // U+00F4 ô
+            GenerateCharacterRange(0x00F6, 0x00F6), // U+00F6 ö
+            GenerateCharacterRange(0x00F9, 0x00F9), // U+00F9 ù
+            GenerateCharacterRange(0x00FB, 0x00FB), // U+00FB û
+            GenerateCharacterRange(0x00FC, 0x00FC)); // U+00FC ü
+    }
+
+    /// <summary>
+    /// OCR text processor for the Italian ("it") locale.
+    /// Inherits normalization and distance handling from EuropeanLanguageProcessorBase.
+    /// </summary>
+    public class ItalianLanguageProcessor : EuropeanLanguageProcessorBase
+    {
+        public ItalianLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings) { }
+
+        public override string Locale => "it";
+
+        public override string[] BlueprintRemovals => new[] { "Progetto", "Piano" };
+
+        // ASCII letters plus '-', '(' and ')' first (no space), then one range per accented letter.
+        public override string CharacterWhitelist => string.Concat(
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-()",
+            GenerateCharacterRange(0x00C0, 0x00C0), // U+00C0 À
+            GenerateCharacterRange(0x00C8, 0x00C8), // U+00C8 È
+            GenerateCharacterRange(0x00C9, 0x00C9), // U+00C9 É
+            GenerateCharacterRange(0x00CC, 0x00CC), // U+00CC Ì
+            GenerateCharacterRange(0x00CD, 0x00CD), // U+00CD Í
+            GenerateCharacterRange(0x00D2, 0x00D2), // U+00D2 Ò
+            GenerateCharacterRange(0x00D3, 0x00D3), // U+00D3 Ó
+            GenerateCharacterRange(0x00D9, 0x00D9), // U+00D9 Ù
+            GenerateCharacterRange(0x00E0, 0x00E0), // U+00E0 à
+            GenerateCharacterRange(0x00E8, 0x00E8), // U+00E8 è
+            GenerateCharacterRange(0x00E9, 0x00E9), // U+00E9 é
+            GenerateCharacterRange(0x00EC, 0x00EC), // U+00EC ì
+            GenerateCharacterRange(0x00ED, 0x00ED), // U+00ED í
+            GenerateCharacterRange(0x00F2, 0x00F2), // U+00F2 ò
+            GenerateCharacterRange(0x00F3, 0x00F3), // U+00F3 ó
+            GenerateCharacterRange(0x00F9, 0x00F9)); // U+00F9 ù
+    }
+}
diff --git a/WFInfo/LanguageProcessing/JapaneseLanguageProcessor.cs b/WFInfo/LanguageProcessing/JapaneseLanguageProcessor.cs
new file mode 100644
index 00000000..3ac16d48
--- /dev/null
+++ b/WFInfo/LanguageProcessing/JapaneseLanguageProcessor.cs
@@ -0,0 +1,229 @@
+using System;
+using System.Collections.Generic;
+using System.Text.RegularExpressions;
+using WFInfo.Settings;
+
+namespace WFInfo.LanguageProcessing
+{
+    /// <summary>
+    /// Japanese language processor for OCR text processing
+    /// Handles Japanese Hiragana, Katakana, and Kanji characters
+    /// </summary>
+    public class JapaneseLanguageProcessor : LanguageProcessor
+    {
+        // Hiragana -> Katakana equivalents. Held in a static field so the table is built once
+        // instead of once per character pair inside the distance loop.
+        private static readonly Dictionary<char, char> hiraganaKatakanaPairs = new Dictionary<char, char>
+        {
+            {'あ', 'ア'}, {'い', 'イ'}, {'う', 'ウ'}, {'え', 'エ'}, {'お', 'オ'},
+            {'か', 'カ'}, {'き', 'キ'}, {'く', 'ク'}, {'け', 'ケ'}, {'こ', 'コ'},
+            {'が', 'ガ'}, {'ぎ', 'ギ'}, {'ぐ', 'グ'}, {'げ', 'ゲ'}, {'ご', 'ゴ'},
+            {'さ', 'サ'}, {'し', 'シ'}, {'す', 'ス'}, {'せ', 'セ'}, {'そ', 'ソ'},
+            {'ざ', 'ザ'}, {'じ', 'ジ'}, {'ず', 'ズ'}, {'ぜ', 'ゼ'}, {'ぞ', 'ゾ'},
+            {'た', 'タ'}, {'ち', 'チ'}, {'つ', 'ツ'}, {'て', 'テ'}, {'と', 'ト'},
+            {'だ', 'ダ'}, {'ぢ', 'ヂ'}, {'づ', 'ヅ'}, {'で', 'デ'}, {'ど', 'ド'},
+            {'な', 'ナ'}, {'に', 'ニ'}, {'ぬ', 'ヌ'}, {'ね', 'ネ'}, {'の', 'ノ'},
+            {'は', 'ハ'}, {'ひ', 'ヒ'}, {'ふ', 'フ'}, {'へ', 'ヘ'}, {'ほ', 'ホ'},
+            {'ば', 'バ'}, {'び', 'ビ'}, {'ぶ', 'ブ'}, {'べ', 'ベ'}, {'ぼ', 'ボ'},
+            {'ぱ', 'パ'}, {'ぴ', 'ピ'}, {'ぷ', 'プ'}, {'ぺ', 'ペ'}, {'ぽ', 'ポ'},
+            {'ま', 'マ'}, {'み', 'ミ'}, {'む', 'ム'}, {'め', 'メ'}, {'も', 'モ'},
+            {'や', 'ヤ'}, {'ゆ', 'ユ'}, {'よ', 'ヨ'},
+            {'ら', 'ラ'}, {'り', 'リ'}, {'る', 'ル'}, {'れ', 'レ'}, {'ろ', 'ロ'},
+            {'わ', 'ワ'}, {'ゐ', 'ヰ'}, {'ゑ', 'ヱ'}, {'を', 'ヲ'}, {'ん', 'ン'},
+            {'っ', 'ッ'}, {'ゃ', 'ャ'}, {'ゅ', 'ュ'}, {'ょ', 'ョ'}
+        };
+
+        // Look-alike pairs (common OCR confusions). A pair matches in either order.
+        // NOTE(review): the he/he and be/be pairs are already covered by the table above, and the
+        // two characters of the wo/wo pair look identical here - confirm the intended forms.
+        private static readonly char[][] similarChars =
+        {
+            new[] {'シ', 'ツ'}, // shi/tsu confusion
+            new[] {'ソ', 'ン'}, // so/n confusion
+            new[] {'ク', 'ワ'}, // ku/wa confusion
+            new[] {'ヘ', 'へ'}, // he/he (different forms)
+            new[] {'ベ', 'べ'}, // be/be (different forms)
+            new[] {'ヲ', 'ヲ'}, // wo/wo (different forms)
+            new[] {'ヶ', 'ケ'}, // ke/ke variation
+            new[] {'ヵ', 'カ'}, // ka/ka variation
+        };
+
+        public JapaneseLanguageProcessor(IReadOnlyApplicationSettings settings) : base(settings)
+        {
+        }
+
+        public override string Locale => "ja";
+
+        public override string[] BlueprintRemovals => new[] { "設計図", "青図" };
+
+        // Code point ranges U+3040-U+30FF (kana) and U+4E00-U+9FAF (kanji), then ASCII letters and space.
+        public override string CharacterWhitelist =>
+            GenerateCharacterRange(0x3040, 0x309F) +
+            GenerateCharacterRange(0x30A0, 0x30FF) +
+            string.Concat(GenerateCharacterRangeIterator(0x4E00, 0x6FFF)) +
+            GenerateCharacterRange(0x7000, 0x7FFF) +
+            GenerateCharacterRange(0x8000, 0x9FAF) +
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz "; // Japanese Hiragana, Katakana, Kanji
+
+        /// <summary>
+        /// Distance between <paramref name="s"/> and <paramref name="t"/>. When both contain Japanese
+        /// characters the similarity-aware calculation is used; otherwise the shared helper is used.
+        /// </summary>
+        public override int CalculateLevenshteinDistance(string s, string t)
+        {
+            if (ContainsJapanese(s) && ContainsJapanese(t))
+            {
+                // Japanese-aware path: compare the original characters using similarity costs.
+                // NOTE(review): unlike the fallback, this path applies neither BlueprintRemovals
+                // nor NormalizeJapaneseCharacters - confirm that is intended.
+                return CalculateJapaneseAwareDistance(s, t);
+            }
+
+            // Fallback/transliterated path: normalize both strings before comparing.
+            return LevenshteinDistanceWithPreprocessing(s, t, BlueprintRemovals, NormalizeJapaneseCharacters, callBaseDefault: true);
+        }
+
+        /// <summary>
+        /// Calculates Japanese-aware Levenshtein distance with character similarity groups.
+        /// Insertions and deletions cost 1; a substitution costs GetJapaneseCharacterDifference.
+        /// </summary>
+        private static int CalculateJapaneseAwareDistance(string s, string t)
+        {
+            if (string.IsNullOrEmpty(s)) return string.IsNullOrEmpty(t) ? 0 : t.Length;
+            if (string.IsNullOrEmpty(t)) return s.Length;
+
+            int n = s.Length;
+            int m = t.Length;
+            int[,] d = new int[n + 1, m + 1];
+
+            for (int i = 0; i <= n; i++)
+                d[i, 0] = i;
+
+            for (int j = 0; j <= m; j++)
+                d[0, j] = j;
+
+            for (int i = 1; i <= n; i++)
+            {
+                for (int j = 1; j <= m; j++)
+                {
+                    // Cheapest of: delete from s, insert into s, or substitute at similarity cost.
+                    int cost = GetJapaneseCharacterDifference(s[i - 1], t[j - 1]);
+                    d[i, j] = Math.Min(
+                        Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),
+                        d[i - 1, j - 1] + cost);
+                }
+            }
+
+            return d[n, m];
+        }
+
+        /// <summary>
+        /// Gets the substitution cost for two characters: 0 when identical, 1 for a listed
+        /// hiragana/katakana equivalent or look-alike pair, 2 for anything else.
+        /// </summary>
+        private static int GetJapaneseCharacterDifference(char a, char b)
+        {
+            if (a == b) return 0;
+
+            // Hiragana/katakana equivalents, checked in both directions
+            if (hiraganaKatakanaPairs.TryGetValue(a, out var katakanaEquiv) && katakanaEquiv == b)
+                return 1;
+            if (hiraganaKatakanaPairs.TryGetValue(b, out var hiraganaEquiv) && hiraganaEquiv == a)
+                return 1;
+
+            foreach (var pair in similarChars)
+            {
+                if ((a == pair[0] && b == pair[1]) || (a == pair[1] && b == pair[0]))
+                    return 1; // Low cost for similar looking characters
+            }
+
+            // Default cost for different characters
+            return 2;
+        }
+
+        /// <summary>
+        /// Applies the Japanese character normalization, lower-cases and trims, isolates "prime"
+        /// as its own word and collapses runs of spaces. Null or empty input is returned as-is.
+        /// </summary>
+        public override string NormalizeForPatternMatching(string input)
+        {
+            if (string.IsNullOrEmpty(input)) return input;
+
+            // Apply Japanese-specific normalization first
+            string normalized = NormalizeJapaneseCharacters(input);
+
+            // Basic cleanup for Japanese
+            normalized = normalized.ToLower(_culture).Trim();
+
+            // Add spaces around "Prime" to match database format better
+            normalized = normalized.Replace("prime", " prime ");
+
+            // Remove extra spaces
+            var parts = normalized.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+            return string.Join(" ", parts);
+        }
+
+        /// <summary>
+        /// Japanese requires a minimum of 4 characters after removing spaces.
+        /// </summary>
+        public override bool IsPartNameValid(string partName)
+        {
+            return !string.IsNullOrEmpty(partName) && partName.Replace(" ", "").Length >= 4;
+        }
+
+        /// <summary>
+        /// Returns true when <paramref name="word"/> should be dropped: it is null or empty, or it
+        /// contains no Japanese characters and is shorter than 2 characters.
+        /// </summary>
+        public override bool ShouldFilterWord(string word)
+        {
+            if (string.IsNullOrEmpty(word)) return true;
+
+            // Keep anything containing Japanese characters (on their own or mixed with Latin),
+            // since Japanese words are meaningful even when split by OCR
+            if (ContainsJapanese(word)) return false;
+
+            // For non-Japanese text, use standard filtering (filter very short words)
+            return word.Length < 2;
+        }
+
+        /// <summary>
+        /// Checks if a string contains Japanese characters (Hiragana, Katakana, or Kanji).
+        /// Null or empty input contains none, so the result is false.
+        /// </summary>
+        private static bool ContainsJapanese(string input)
+        {
+            if (string.IsNullOrEmpty(input)) return false;
+
+            foreach (char c in input)
+            {
+                // Hiragana (0x3040-0x309F)
+                if (c >= 0x3040 && c <= 0x309F) return true;
+                // Katakana (0x30A0-0x30FF)
+                if (c >= 0x30A0 && c <= 0x30FF) return true;
+                // Kanji (0x4E00-0x9FAF)
+                if (c >= 0x4E00 && c <= 0x9FAF) return true;
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// Normalizes Japanese characters for comparison: NormalizeFullWidthCharacters first,
+        /// then a handful of kana replacements, then invariant lower-casing.
+        /// </summary>
+        private static string NormalizeJapaneseCharacters(string input)
+        {
+            string result = NormalizeFullWidthCharacters(input);
+
+            // Normalize katakana/hiragana variations and common OCR confusions
+            result = result.Replace('ヶ', 'ケ').Replace('ヵ', 'カ');
+            result = result.Replace('゙', '゛').Replace('゚', '゜'); // Dakuten and Handakuten normalization
+
+            // Common katakana OCR confusions
+            // (the former leading Replace of wo with itself changed nothing and was dropped)
+            result = result.Replace('ヮ', 'ワ').Replace('ヰ', 'イ').Replace('ヱ', 'エ').Replace('ヲ', 'オ');
+
+            return result.ToLowerInvariant();
+        }
+    }
+}
diff --git a/WFInfo/LanguageProcessing/KoreanLanguageProcessor.cs b/WFInfo/LanguageProcessing/KoreanLanguageProcessor.cs
new file mode 100644
index 00000000..12a46cb0
--- /dev/null
+++ b/WFInfo/LanguageProcessing/KoreanLanguageProcessor.cs
@@ -0,0 +1,494 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text.RegularExpressions;
+using WFInfo.Settings;
+
+namespace WFInfo.LanguageProcessing
+{
+ ///
+ /// Korean language processor for OCR text processing
+ /// Handles Korean Hangul characters with special normalization rules
+ ///
+ public class KoreanLanguageProcessor : LanguageProcessor
+ {
+
+ // Static spacing corrections to avoid recreating dictionary on every call
+ private static readonly Dictionary spacingCorrections = new Dictionary
+ {
+ {" ", " "}, {" ", " "}, {" ", " "}
+ };
+
+ // Static Korean character replacements to avoid recreating list on every call
+ private static readonly List> koreanReplacements = new List>
+ {
+ // Basic consonants and vowels
+ new KeyValuePair("가", "ga"), new KeyValuePair("개", "gae"), new KeyValuePair("갸", "gya"), new KeyValuePair("걔", "gyae"), new KeyValuePair("거", "geo"), new KeyValuePair("게", "ge"), new KeyValuePair("겨", "gyeo"), new KeyValuePair("계", "gye"),
+ new KeyValuePair("고", "go"), new KeyValuePair("과", "gwa"), new KeyValuePair("궈", "gwo"), new KeyValuePair("괘", "gwae"), new KeyValuePair("괴", "goe"), new KeyValuePair("교", "gyo"), new KeyValuePair("구", "gu"),
+ new KeyValuePair("궤", "gwe"), new KeyValuePair("귀", "gwi"), new KeyValuePair("규", "gyu"), new KeyValuePair("그", "geu"), new KeyValuePair("긔", "gui"), new KeyValuePair("기", "gi"),
+
+ new KeyValuePair("나", "na"), new KeyValuePair("내", "nae"), new KeyValuePair("냐", "nya"), new KeyValuePair("냬", "nyae"), new KeyValuePair("너", "neo"), new KeyValuePair("네", "ne"), new KeyValuePair("녀", "nyeo"), new KeyValuePair("녜", "nye"),
+ new KeyValuePair("노", "no"), new KeyValuePair("놔", "nwa"), new KeyValuePair("놰", "nwo"), new KeyValuePair("뇌", "noe"), new KeyValuePair("뇨", "nyo"), new KeyValuePair("누", "nu"), new KeyValuePair("뉘", "nwi"),
+ new KeyValuePair("뉴", "nyu"), new KeyValuePair("느", "neu"), new KeyValuePair("늬", "nui"), new KeyValuePair("니", "ni"),
+
+ new KeyValuePair("다", "da"), new KeyValuePair("대", "dae"), new KeyValuePair("댜", "dya"), new KeyValuePair("댸", "dyae"), new KeyValuePair("더", "deo"), new KeyValuePair("데", "de"), new KeyValuePair("뎌", "dyeo"), new KeyValuePair("뎨", "dye"),
+ new KeyValuePair("도", "do"), new KeyValuePair("돠", "dwa"), new KeyValuePair("돼", "dwae"), new KeyValuePair("됴", "dyo"), new KeyValuePair("두", "du"), new KeyValuePair("둬", "dwo"), new KeyValuePair("뒈", "dwae"),
+ new KeyValuePair("뒤", "dwi"), new KeyValuePair("듀", "dyu"), new KeyValuePair("드", "deu"), new KeyValuePair("듸", "dui"), new KeyValuePair("디", "di"),
+
+ new KeyValuePair("라", "ra"), new KeyValuePair("래", "rae"), new KeyValuePair("랴", "rya"), new KeyValuePair("럐", "ryae"), new KeyValuePair("러", "reo"), new KeyValuePair("레", "re"), new KeyValuePair("려", "ryeo"), new KeyValuePair("례", "rye"),
+ new KeyValuePair("로", "ro"), new KeyValuePair("롸", "rwa"), new KeyValuePair("뢔", "roe"), new KeyValuePair("료", "ryo"), new KeyValuePair("루", "ru"), new KeyValuePair("뤄", "rwo"), new KeyValuePair("뤠", "rwae"), new KeyValuePair("뤼", "rwi"),
+ new KeyValuePair("류", "ryu"), new KeyValuePair("르", "reu"), new KeyValuePair("릐", "rui"), new KeyValuePair("리", "ri"),
+
+ new KeyValuePair("마", "ma"), new KeyValuePair("매", "mae"), new KeyValuePair("먀", "mya"), new KeyValuePair("먜", "myae"), new KeyValuePair("머", "meo"), new KeyValuePair("메", "me"), new KeyValuePair("며", "myeo"), new KeyValuePair("몌", "mye"),
+ new KeyValuePair("모", "mo"), new KeyValuePair("뫄", "mwa"), new KeyValuePair("뫠", "mwae"), new KeyValuePair("뫼", "moe"), new KeyValuePair("묘", "myo"), new KeyValuePair("무", "mu"), new KeyValuePair("뭐", "mwo"), new KeyValuePair("뭬", "mwae"),
+ new KeyValuePair("뮈", "mwi"), new KeyValuePair("뮤", "myu"), new KeyValuePair("므", "meu"), new KeyValuePair("믜", "mui"), new KeyValuePair("미", "mi"),
+
+ new KeyValuePair("바", "ba"), new KeyValuePair("배", "bae"), new KeyValuePair("뱌", "bya"), new KeyValuePair("뱨", "byae"), new KeyValuePair("버", "beo"), new KeyValuePair("베", "be"), new KeyValuePair("벼", "byeo"), new KeyValuePair("볘", "bye"),
+ new KeyValuePair("보", "bo"), new KeyValuePair("봐", "bwa"), new KeyValuePair("봬", "bwae"), new KeyValuePair("뵈", "boe"), new KeyValuePair("뵤", "byo"), new KeyValuePair("부", "bu"), new KeyValuePair("붜", "bwo"), new KeyValuePair("붸", "bwae"),
+ new KeyValuePair("뷔", "bwi"), new KeyValuePair("뷰", "byu"), new KeyValuePair("브", "beu"), new KeyValuePair("븨", "bui"), new KeyValuePair("비", "bi"),
+
+ new KeyValuePair("사", "sa"), new KeyValuePair("새", "sae"), new KeyValuePair("샤", "sya"), new KeyValuePair("섀", "syae"), new KeyValuePair("서", "seo"), new KeyValuePair("세", "se"), new KeyValuePair("셔", "syeo"), new KeyValuePair("셰", "sye"),
+ new KeyValuePair("소", "so"), new KeyValuePair("솨", "swa"), new KeyValuePair("쇄", "swae"), new KeyValuePair("쇠", "soe"), new KeyValuePair("쇼", "syo"), new KeyValuePair("수", "su"), new KeyValuePair("숴", "swo"), new KeyValuePair("쉐", "swae"),
+ new KeyValuePair("쉬", "swi"), new KeyValuePair("슈", "syu"), new KeyValuePair("스", "seu"), new KeyValuePair("싀", "sui"), new KeyValuePair("시", "si"),
+
+ new KeyValuePair("아", "a"), new KeyValuePair("애", "ae"), new KeyValuePair("야", "ya"), new KeyValuePair("얘", "yae"), new KeyValuePair("어", "eo"), new KeyValuePair("에", "e"), new KeyValuePair("여", "yeo"), new KeyValuePair("예", "ye"),
+ new KeyValuePair("오", "o"), new KeyValuePair("와", "wa"), new KeyValuePair("왜", "wae"), new KeyValuePair("외", "oe"), new KeyValuePair("요", "yo"), new KeyValuePair("우", "u"), new KeyValuePair("워", "wo"), new KeyValuePair("웨", "we"),
+ new KeyValuePair("위", "wi"), new KeyValuePair("유", "yu"), new KeyValuePair("으", "eu"), new KeyValuePair("의", "ui"), new KeyValuePair("이", "i"),
+
+ new KeyValuePair("자", "ja"), new KeyValuePair("재", "jae"), new KeyValuePair("쟈", "jya"), new KeyValuePair("쟤", "jyae"), new KeyValuePair("저", "jeo"), new KeyValuePair("제", "je"), new KeyValuePair("져", "jyeo"), new KeyValuePair("졔", "jye"),
+ new KeyValuePair("조", "jo"), new KeyValuePair("좌", "jwa"), new KeyValuePair("좨", "jwae"), new KeyValuePair("죄", "joe"), new KeyValuePair("죠", "jyo"), new KeyValuePair("주", "ju"), new KeyValuePair("줘", "jwo"), new KeyValuePair("줴", "jwae"),
+ new KeyValuePair("쥐", "jwi"), new KeyValuePair("쥬", "jyu"), new KeyValuePair("즈", "jeu"), new KeyValuePair("즤", "jui"), new KeyValuePair("지", "ji"),
+
+ new KeyValuePair("차", "cha"), new KeyValuePair("채", "chae"), new KeyValuePair("챠", "chya"), new KeyValuePair("챼", "chyae"), new KeyValuePair("처", "cheo"), new KeyValuePair("체", "che"), new KeyValuePair("쳐", "chyeo"), new KeyValuePair("쳬", "chye"),
+ new KeyValuePair("초", "cho"), new KeyValuePair("촤", "chwa"), new KeyValuePair("쵀", "chwae"), new KeyValuePair("최", "choe"), new KeyValuePair("쵸", "chyo"), new KeyValuePair("추", "chu"), new KeyValuePair("춰", "chwo"), new KeyValuePair("췌", "chwae"),
+ new KeyValuePair