From e11d7b1467711ff3d1b3f614ba838a4c8d373436 Mon Sep 17 00:00:00 2001 From: Jon Thysell Date: Thu, 17 Jul 2025 21:46:14 -0700 Subject: [PATCH] Switch to using QuickDict --- CHANGELOG.md | 5 + LICENSE.md | 2 +- README.md | 2 +- src/Directory.Build.props | 2 +- src/HawDict.sln | 4 +- src/HawDict/HawDict.csproj | 2 + src/HawDict/Input/InputDictBase.cs | 168 ++++++++++---- src/HawDict/Input/MamakaKaiaoInputDict.cs | 93 ++++---- src/HawDict/Input/PlaceNamesInputDict.cs | 39 ++-- src/HawDict/Input/PukuiElbertInputDict.cs | 245 ++++++++++---------- src/HawDict/Output/OutputAbbreviation.cs | 64 ------ src/HawDict/Output/OutputArticle.cs | 258 ---------------------- src/HawDict/Output/OutputDictBase.cs | 44 ---- src/HawDict/Output/StarDictDictionary.cs | 196 ---------------- src/HawDict/Output/XdxfDictionary.cs | 167 -------------- src/HawDict/StringUtils.cs | 48 ---- 16 files changed, 333 insertions(+), 1006 deletions(-) delete mode 100644 src/HawDict/Output/OutputAbbreviation.cs delete mode 100644 src/HawDict/Output/OutputArticle.cs delete mode 100644 src/HawDict/Output/OutputDictBase.cs delete mode 100644 src/HawDict/Output/StarDictDictionary.cs delete mode 100644 src/HawDict/Output/XdxfDictionary.cs diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c6f786..7499220 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # HawDict ChangeLog # +## next ## + +* Switch to using QuickDict for StarDict and XDXF creation +* Fixed --format filtering issue + ## v0.17.3 ## * Fixed build break diff --git a/LICENSE.md b/LICENSE.md index 262264b..91e2cf9 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2018-2024 Jon Thysell +Copyright (c) 2018-2025 Jon Thysell Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index a2431af..a7e99d7 100644 --- a/README.md +++ b/README.md @@ -71,4 +71,4 @@ HawDict is open-source under the MIT license. HawDict does not include any copyrighted dictionary data - it is just a converter. HawDict uses the [Html Agility Pack](https://github.com/zzzprojects/html-agility-pack/) to download and parse the dictionary data from [Ulukau](https://ulukau.org/) at runtime. All dictionary data (terms, definitions, etc) is copyright their respective copyright owners. -HawDict Copyright (c) 2018-2024 Jon Thysell +HawDict Copyright (c) 2018-2025 Jon Thysell diff --git a/src/Directory.Build.props b/src/Directory.Build.props index ebb7b48..3075966 100644 --- a/src/Directory.Build.props +++ b/src/Directory.Build.props @@ -3,7 +3,7 @@ HawDict Jon Thysell Jon Thysell - Copyright © 2018-2024 Jon Thysell + Copyright © 2018-2025 Jon Thysell LICENSE.md https://github.com/jonthysell/HawDict net6.0 diff --git a/src/HawDict.sln b/src/HawDict.sln index 5485712..1393bbd 100644 --- a/src/HawDict.sln +++ b/src/HawDict.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.31112.23 +# Visual Studio Version 17 +VisualStudioVersion = 17.14.36301.6 MinimumVisualStudioVersion = 10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "HawDict", "HawDict\HawDict.csproj", "{50C2D599-98A1-496B-8E0D-561071C54BE5}" EndProject diff --git a/src/HawDict/HawDict.csproj b/src/HawDict/HawDict.csproj index 13f914b..2d48c85 100644 --- a/src/HawDict/HawDict.csproj +++ b/src/HawDict/HawDict.csproj @@ -5,7 +5,9 @@ HawDict HawDict + + diff --git a/src/HawDict/Input/InputDictBase.cs b/src/HawDict/Input/InputDictBase.cs index 3fbc372..9091228 100644 --- a/src/HawDict/Input/InputDictBase.cs +++ b/src/HawDict/Input/InputDictBase.cs @@ -6,6 +6,9 @@ using System.IO; using System.Linq; using System.Text; +using System.Text.RegularExpressions; + +using QuickDict; namespace HawDict { @@ -15,9 +18,9 @@ namespace HawDict public enum OutputFormats { None, - CleanTxt, - StarDict, - Xdxf, + CleanTxt = 0x1, + StarDict = 0x2, + Xdxf = 0x4, All = CleanTxt + StarDict + Xdxf, } @@ -79,12 +82,16 @@ public void Process(string rootDir, OutputFormats outputFormats = OutputFormats. if (outputFormats.HasFlag(OutputFormats.StarDict)) { - SaveOutputDict(); + string starDictFile = Path.Combine(DictDir, $"{ID}.{TranslationType}.StarDict.ifo"); + SaveStarDictFile(starDictFile); } if (outputFormats.HasFlag(OutputFormats.Xdxf)) { - SaveOutputDict(); + string xdxfFile = Path.Combine(DictDir, $"{ID}.{TranslationType}.dict.xdxf"); + SaveXdxfFile(xdxfFile); + + Log("Building XDXF dictionary."); } Log("Save end."); @@ -128,57 +135,144 @@ private void SaveCleanFile(string cleanPath) Log("Saved {0} entries.", count); } - protected abstract void GetRawDataFromSource(); - - protected abstract IEnumerable> GetCleanEntries(); - - private void AddArticles(OutputDictBase dict) + private DictionaryMetadata GetMetadata() { - dict.Articles.AddRange(GetCleanEntries().Select(kvp => new OutputArticle(dict, kvp.Key, kvp.Value))); + var metadata = new DictionaryMetadata(); + + metadata.ShortTitle = ShortTitle; + metadata.LongTitle = LongTitle; + metadata.Description = Description; + metadata.Authors.AddRange(Authors); + metadata.SrcUrl = SrcUrl; + metadata.ArticleKeyLangCode = TranslationType == TranslationType.EngToHaw ? "ENG" : "HAW"; + metadata.ArticleValueLangCode = TranslationType == TranslationType.EngToHaw ? "HAW" : "ENG"; + metadata.FileVersion = AppInfo.Version; + + return metadata; } - private void SaveOutputDict() where T : OutputDictBase + private void SaveStarDictFile(string starDictPath) { - OutputDictBase outputDict = GetOutputDict(); + var dict = new StarDictDictionary(GetMetadata()); - Log("Building {0} dictionary.", outputDict.FormatType); - AddArticles(outputDict); + dict.GetStarDictSynonymsFromArticle = a => + { + HashSet synonyms = new HashSet + { + a.Key + }; - Log("Saving {0} dictionary.", outputDict.FormatType); - outputDict.Save(DictDir); - } + foreach (string key in a.Key.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries)) + { + string s = key.Replace(StringUtils.SyllableDotUtf8, "").Replace(".", "").Replace("*", "").Replace("-", ""); + + synonyms.Add(s); + synonyms.Add(StringUtils.ReplaceOkina(s)); + synonyms.Add(StringUtils.ReplaceOkina(s, "")); + synonyms.Add(StringUtils.RemoveDiacritics(s)); + synonyms.Add(StringUtils.ReplaceOkina(StringUtils.RemoveDiacritics(s))); + synonyms.Add(StringUtils.ReplaceOkina(StringUtils.RemoveDiacritics(s), "")); + } - private OutputDictBase GetOutputDict() where T : OutputDictBase - { - OutputDictBase dict = null; + return synonyms; + }; - if (typeof(T) == typeof(XdxfDictionary)) + dict.GetValueFromArticle = a => { - dict = new XdxfDictionary(ID, TranslationType) + var valueSB = new StringBuilder(); + + foreach (var definition in a.Value.GetDefinitions(true)) { - Title = ShortTitle, - FullTitle = LongTitle, - Description = Description, - SrcUrl = SrcUrl, - }; + var value = definition; + // Add abbreviations + foreach (var abbreviation in a.Parent.Abbreviations) + { + value = value.WrapInTag(abbreviation.Key, "i", StringWrapInTagOptions.WrapWholeWordsOnly); + if (abbreviation.Key.Length > 1 && char.IsLower(abbreviation.Key[0])) + { + value = value.WrapInTag(char.ToUpper(abbreviation.Key[0]) + abbreviation.Key.Substring(1), "i", StringWrapInTagOptions.WrapWholeWordsOnly); + } + } + + value = value.WrapInTag("p"); + + // Add bold for numbering + value = Regex.Replace(value, "

([0-9]+)\\. ", "

$1. "); + if (value.Contains("2. ")) + { + // Fix bolding number one for pre-text + value = Regex.Replace(value, "

(.*[^>])1\\. ", "

$11. "); + } + + valueSB.Append(value); + } + + return valueSB.ToString(); + }; + + Log("Building StarDict dictionary."); + + foreach (var kvp in GetCleanEntries()) + { + dict.AddArticle(kvp.Key, kvp.Value); } - else if (typeof(T) == typeof(StarDictDictionary)) + + AddAbbreviations(dict); + + Log("Saving StarDict dictionary."); + + dict.Save(starDictPath); + } + + private void SaveXdxfFile(string xdxfPath) + { + var dict = new XdxfDictionary(GetMetadata()); + + dict.GetXdxfKeysFromAbbreviation = a => { - dict = new StarDictDictionary(ID, TranslationType) + var list = new List() { a.Key }; + if (a.Key.Length > 1 && char.IsLower(a.Key[0])) { - Title = LongTitle, - Description = Description, - }; - } + list.Add(char.ToUpper(a.Key[0]) + a.Key.Substring(1)); + } + return list; + }; - dict.Authors.AddRange(Authors); + dict.GetXdxfKeysFromArticle = a => + { + return a.Key.Split(',', StringSplitOptions.RemoveEmptyEntries).ToList(); + }; + + dict.GetXdxfKeyOptionalTerms = () => + { + return new HashSet() { ".", StringUtils.SyllableDotUtf8 }; + }; + + dict.GetXdxfValuesFromArticle = a => + { + return a.Value.GetDefinitions(false).ToList(); + }; + + Log("Building XDXF dictionary."); + + foreach (var kvp in GetCleanEntries()) + { + dict.AddArticle(kvp.Key, kvp.Value); + } AddAbbreviations(dict); - return dict; + Log("Saving XDXF dictionary."); + + dict.Save(xdxfPath); } - protected abstract void AddAbbreviations(OutputDictBase dict); + protected abstract void GetRawDataFromSource(); + + protected abstract IEnumerable> GetCleanEntries(); + + protected abstract void AddAbbreviations(DictionaryBase dict); + } public enum TranslationType diff --git a/src/HawDict/Input/MamakaKaiaoInputDict.cs b/src/HawDict/Input/MamakaKaiaoInputDict.cs index 330e943..1b695dd 100644 --- a/src/HawDict/Input/MamakaKaiaoInputDict.cs +++ b/src/HawDict/Input/MamakaKaiaoInputDict.cs @@ -6,6 +6,8 @@ using HtmlAgilityPack; +using QuickDict; + namespace HawDict { public class MamakaKaiaoInputDict : HtmlInputDict @@ -251,7 +253,7 @@ protected override string[] ParseEntryNode(HtmlNode node) { string entryName = node.FirstChild.OuterHtml; string entryValue = node.InnerHtml.Remove(0, entryName.Length); - + try { return new string[] { StringUtils.NormalizeWhiteSpace(StringUtils.SingleLineNoTabs(entryName)), StringUtils.NormalizeWhiteSpace(StringUtils.SingleLineNoTabs(entryValue)) }; @@ -269,53 +271,50 @@ protected override string FinalCleanValue(string value) return StringUtils.FixSentenceSpacing(value); } - protected override void AddAbbreviations(OutputDictBase dict) + protected override void AddAbbreviations(DictionaryBase dict) { - dict.Abbreviations.AddRange(new OutputAbbreviation[] - { - new OutputAbbreviation(dict, "abb.", "abbreviation"), - new OutputAbbreviation(dict, "Bib.", "Bible"), - new OutputAbbreviation(dict, "cf.", "compare", AbbreviationType.Auxiliary), - new OutputAbbreviation(dict, "comb.", "combined form"), - new OutputAbbreviation(dict, "dic.", "dictionary definition"), - new OutputAbbreviation(dict, "e.g.", "for example", AbbreviationType.Auxiliary), - new OutputAbbreviation(dict, "Eng.", "English"), - new OutputAbbreviation(dict, "ext. mng.", "extended meaning"), - new OutputAbbreviation(dict, "i.e.", "that is", AbbreviationType.Auxiliary), - new OutputAbbreviation(dict, "inv.", "invention"), - new OutputAbbreviation(dict, "Japn.", "Japanese"), - new OutputAbbreviation(dict, "lit.", "literally"), - new OutputAbbreviation(dict, "mān.", "mānaleo (native speaker)"), - new OutputAbbreviation(dict, "new mng.", "new meaning"), - new OutputAbbreviation(dict, "PPN", "Proto Polynesian"), - new OutputAbbreviation(dict, "redup.", "reduplication"), - new OutputAbbreviation(dict, "sh.", "shortened form"), - new OutputAbbreviation(dict, "sp. var.", "spelling variation"), - new OutputAbbreviation(dict, "Tah.", "Tahitian"), - new OutputAbbreviation(dict, "trad.", "traditional literary sources"), - new OutputAbbreviation(dict, "var.", "variation"), - new OutputAbbreviation(dict, "ham", "hamani (transitive verb)", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "heh", "hehele (intransitive verb)", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "ʻaʻ", "ʻaʻano (stative verb)", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "kik", "kikino (common noun)", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "iʻoa", "iʻoa (proper noun)", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "EK", "Elama Kanahele"), - new OutputAbbreviation(dict, "HA", "Henry Auwae"), - new OutputAbbreviation(dict, "HHLH", "Helen Haleola Lee Hong"), - new OutputAbbreviation(dict, "HKM", "Harry Kunihi Mitchell"), - new OutputAbbreviation(dict, "JPM", "Joseph Puipui Makaai"), - new OutputAbbreviation(dict, "KKK", "Kaui Keola Keamoai"), - new OutputAbbreviation(dict, "LK", "Louise Keliihoomalu"), - new OutputAbbreviation(dict, "MMLH", "Martha Manoanoa Lum Ho"), - new OutputAbbreviation(dict, "MW", "Minnie Whitford"), - new OutputAbbreviation(dict, "Anatomia", "Judd, Gerrit P. Anatomia"), - new OutputAbbreviation(dict, "Bihopa", "Bihopa, E. A. Haawina Mua o ka Hoailona Helu"), - new OutputAbbreviation(dict, "Bounty", "HeMoolelo no na Luina Kipi o ka Moku Bounty"), - new OutputAbbreviation(dict, "Legendre", "Legendre, A. M. Ke Anahonua"), - new OutputAbbreviation(dict, "Judd", "Judd et al. Hawaiian Language Imprints, 1822-1899"), - new OutputAbbreviation(dict, "Pakaa", "Nakuina, Moses K. Pakaa a me Ku-a-Pakaa"), - new OutputAbbreviation(dict, "Wilcox", "Wilcox, Robert"), - }); + dict.AddAbbreviation("abb.", "abbreviation"); + dict.AddAbbreviation("Bib.", "Bible"); + dict.AddAbbreviation("cf.", "compare", AbbreviationType.Auxiliary); + dict.AddAbbreviation("comb.", "combined form"); + dict.AddAbbreviation("dic.", "dictionary definition"); + dict.AddAbbreviation("e.g.", "for example", AbbreviationType.Auxiliary); + dict.AddAbbreviation("Eng.", "English"); + dict.AddAbbreviation("ext. mng.", "extended meaning"); + dict.AddAbbreviation("i.e.", "that is", AbbreviationType.Auxiliary); + dict.AddAbbreviation("inv.", "invention"); + dict.AddAbbreviation("Japn.", "Japanese"); + dict.AddAbbreviation("lit.", "literally"); + dict.AddAbbreviation("mān.", "mānaleo (native speaker)"); + dict.AddAbbreviation("new mng.", "new meaning"); + dict.AddAbbreviation("PPN", "Proto Polynesian"); + dict.AddAbbreviation("redup.", "reduplication"); + dict.AddAbbreviation("sh.", "shortened form"); + dict.AddAbbreviation("sp. var.", "spelling variation"); + dict.AddAbbreviation("Tah.", "Tahitian"); + dict.AddAbbreviation("trad.", "traditional literary sources"); + dict.AddAbbreviation("var.", "variation"); + dict.AddAbbreviation("ham", "hamani (transitive verb)", AbbreviationType.Grammatical); + dict.AddAbbreviation("heh", "hehele (intransitive verb)", AbbreviationType.Grammatical); + dict.AddAbbreviation("ʻaʻ", "ʻaʻano (stative verb)", AbbreviationType.Grammatical); + dict.AddAbbreviation("kik", "kikino (common noun)", AbbreviationType.Grammatical); + dict.AddAbbreviation("iʻoa", "iʻoa (proper noun)", AbbreviationType.Grammatical); + dict.AddAbbreviation("EK", "Elama Kanahele"); + dict.AddAbbreviation("HA", "Henry Auwae"); + dict.AddAbbreviation("HHLH", "Helen Haleola Lee Hong"); + dict.AddAbbreviation("HKM", "Harry Kunihi Mitchell"); + dict.AddAbbreviation("JPM", "Joseph Puipui Makaai"); + dict.AddAbbreviation("KKK", "Kaui Keola Keamoai"); + dict.AddAbbreviation("LK", "Louise Keliihoomalu"); + dict.AddAbbreviation("MMLH", "Martha Manoanoa Lum Ho"); + dict.AddAbbreviation("MW", "Minnie Whitford"); + dict.AddAbbreviation("Anatomia", "Judd, Gerrit P. Anatomia"); + dict.AddAbbreviation("Bihopa", "Bihopa, E. A. Haawina Mua o ka Hoailona Helu"); + dict.AddAbbreviation("Bounty", "HeMoolelo no na Luina Kipi o ka Moku Bounty"); + dict.AddAbbreviation("Legendre", "Legendre, A. M. Ke Anahonua"); + dict.AddAbbreviation("Judd", "Judd et al. Hawaiian Language Imprints, 1822-1899"); + dict.AddAbbreviation("Pakaa", "Nakuina, Moses K. Pakaa a me Ku-a-Pakaa"); + dict.AddAbbreviation("Wilcox", "Wilcox, Robert"); } } } diff --git a/src/HawDict/Input/PlaceNamesInputDict.cs b/src/HawDict/Input/PlaceNamesInputDict.cs index 65ba678..e6f5042 100644 --- a/src/HawDict/Input/PlaceNamesInputDict.cs +++ b/src/HawDict/Input/PlaceNamesInputDict.cs @@ -6,6 +6,8 @@ using HtmlAgilityPack; +using QuickDict; + namespace HawDict { public class PlaceNamesInputDict : HtmlInputDict @@ -91,27 +93,24 @@ protected override string FinalCleanValue(string value) return StringUtils.FixSentenceSpacing(value); } - protected override void AddAbbreviations(OutputDictBase dict) + protected override void AddAbbreviations(DictionaryBase dict) { - dict.Abbreviations.AddRange(new OutputAbbreviation[] - { - new OutputAbbreviation(dict, "For. Sel.", "Elbert, Selections from Fornander"), - new OutputAbbreviation(dict, "For.", "Fornander, Hawaiian Antiquities (e.g., For. 5:176 means Fornander, Volume 5, p. 176)"), - new OutputAbbreviation(dict, "HM", "Beckwith, Hawaiian Mythology"), - new OutputAbbreviation(dict, "Indices", "Indices of Awards..."), - new OutputAbbreviation(dict, "Kuy. 1", "Kuykendall, The Hawaiian Kingdom, Volume 1"), - new OutputAbbreviation(dict, "Kuy. 2", "Kuykendall, The Hawaiian Kingdom, Volume 2"), - new OutputAbbreviation(dict, "Kuy. 3", "Kuykendall, The Hawaiian Kingdom, Volume 3"), - new OutputAbbreviation(dict, "lit.", "literally"), - new OutputAbbreviation(dict, "PE", "Pukui and Elbert, Hawaiian Dictionary"), - new OutputAbbreviation(dict, "PH", "Emerson, Pele and Hiiaka"), - new OutputAbbreviation(dict, "qd.", "quadrangle"), - new OutputAbbreviation(dict, "qds.", "quadrangles (maps 2-4)"), - new OutputAbbreviation(dict, "RC", "Ruling Chiefs"), - new OutputAbbreviation(dict, "TM", "Taylor and Miranda, \"Honolulu Street Names\""), - new OutputAbbreviation(dict, "UL", "Emerson, Unwritten Literature..."), - new OutputAbbreviation(dict, "*", "Pronunciation and meaning uncertain"), - }); + dict.AddAbbreviation("For. Sel.", "Elbert, Selections from Fornander"); + dict.AddAbbreviation("For.", "Fornander, Hawaiian Antiquities (e.g., For. 5:176 means Fornander, Volume 5, p. 176)"); + dict.AddAbbreviation("HM", "Beckwith, Hawaiian Mythology"); + dict.AddAbbreviation("Indices", "Indices of Awards..."); + dict.AddAbbreviation("Kuy. 1", "Kuykendall, The Hawaiian Kingdom, Volume 1"); + dict.AddAbbreviation("Kuy. 2", "Kuykendall, The Hawaiian Kingdom, Volume 2"); + dict.AddAbbreviation("Kuy. 3", "Kuykendall, The Hawaiian Kingdom, Volume 3"); + dict.AddAbbreviation("lit.", "literally"); + dict.AddAbbreviation("PE", "Pukui and Elbert, Hawaiian Dictionary"); + dict.AddAbbreviation("PH", "Emerson, Pele and Hiiaka"); + dict.AddAbbreviation("qd.", "quadrangle"); + dict.AddAbbreviation("qds.", "quadrangles (maps 2-4)"); + dict.AddAbbreviation("RC", "Ruling Chiefs"); + dict.AddAbbreviation("TM", "Taylor and Miranda, \"Honolulu Street Names\""); + dict.AddAbbreviation("UL", "Emerson, Unwritten Literature..."); + dict.AddAbbreviation("*", "Pronunciation and meaning uncertain"); } } } diff --git a/src/HawDict/Input/PukuiElbertInputDict.cs b/src/HawDict/Input/PukuiElbertInputDict.cs index 7bee6f9..ca2bbc3 100644 --- a/src/HawDict/Input/PukuiElbertInputDict.cs +++ b/src/HawDict/Input/PukuiElbertInputDict.cs @@ -6,6 +6,8 @@ using HtmlAgilityPack; +using QuickDict; + namespace HawDict { public class PukuiElbertInputDict : HtmlInputDict @@ -37,12 +39,13 @@ protected override string CleanSourceHtml(string s) { // Remove header comments s = Regex.Replace(s, "

In causative/simulative forms beginning with.*\n", ""); - return s + s = s .Replace("

 

\n
", "") .Replace("

 

\n

 

\n
", "") .Replace("&4 ", "Redup. ").Replace("&;n", "n.").Replace("&(PCP; ", "(PCP ").Replace("(Mele. ", "(Mele ") .Replace("..", ".").Replace("..", ".").Replace("..", ".") .Replace("“", "\"").Replace("”", "\"") + .Replace(" ,", ",") .Replace("T.44>", "") .Replace("h3", "span") // Typo fixes: @@ -165,7 +168,7 @@ protected override string CleanSourceHtml(string s) .Replace("Ka-pū,lehu", "Ka-pū.lehu") .Replace("ā…paha", "ā … paha") // Typos with _ - .Replace("Na_na_", "Nānā") + .Replace("Na_na_", "Nānā.") .Replace(">Palaki ʻan_ai", ">Palaki ʻānai") .Replace("Pal_aha", "Pālaha") .Replace(">ka mea an_a", ">ka mea āna") @@ -181,9 +184,14 @@ protected override string CleanSourceHtml(string s) .Replace(">Ma kahi maikaʻi e paʻawela ana n_", ">Ma kahi maikaʻi e paʻawela ana nō") .Replace(">p_u.ʻulu kaua ", ">pū.ʻulu kaua ") .Replace(">Kō wai kaʻa k_elā?", ">Kō wai kaʻa kēlā?") + .Replace("A spindly banana . . ,", "A spindly banana …,") // Missing definition number fixes .Replace("

n. Name of a large valley on", "

1. n. Name of a large valley on") ; + // Fix Nānā references + s = Regex.Replace(s, @"Nānā;? (\d)", @"Nānā. $1"); + s = Regex.Replace(s, @"Nānā;? (\d)", @"Nānā. $1"); + return s; } protected override bool IsEntryNode(HtmlNode node) @@ -206,125 +214,122 @@ protected override string FinalCleanValue(string value) return StringUtils.FixSentenceSpacing(value); } - protected override void AddAbbreviations(OutputDictBase dict) + protected override void AddAbbreviations(DictionaryBase dict) { - dict.Abbreviations.AddRange(new OutputAbbreviation[] - { - new OutputAbbreviation(dict, "And.", "Andrews dictionary, 1865; reference is given only if no evidence is available other than that in Andrews and Andrews-Parker (AP)"), - new OutputAbbreviation(dict, "AP", "Andrews-Parker dictionary, 1922; reference is given only if no evidence is available other than that in Andrews (And.) and Andrews-Parker"), - new OutputAbbreviation(dict, "Cap.", "beginning with a capital letter"), - new OutputAbbreviation(dict, "caus/sim.", "causative/simulative", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "cf.", "compare", AbbreviationType.Auxiliary), - new OutputAbbreviation(dict, "conj.", "conjunction", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "demon.", "demonstrative", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "Eng.", "word borrowed from English"), - new OutputAbbreviation(dict, "ex.", "example, examples", AbbreviationType.Auxiliary), - new OutputAbbreviation(dict, "f.", "form (in names of plants)"), - new OutputAbbreviation(dict, "fig.", "figuratively"), - new OutputAbbreviation(dict, "For.", "Fornander, Hawaiian Antiquities (For. 4:297 = Fornander Vol. 4, p. 297)"), - new OutputAbbreviation(dict, "FS", "Elbert, Selections from Fornander"), - new OutputAbbreviation(dict, "GP", "Green and Pukui, Legend of Kawelo"), - new OutputAbbreviation(dict, "Gr.", "word probably borrowed from Greek"), - new OutputAbbreviation(dict, "Gram.", "Elbert and Pukui, Hawaiian Grammar"), - new OutputAbbreviation(dict, "Heb.", "word probably borrowed from Hebrew"), - new OutputAbbreviation(dict, "HM", "Beckwith, Hawaiian Mythology"), - new OutputAbbreviation(dict, "HP", "Handy, Hawaiian Planter"), - new OutputAbbreviation(dict, "Ii", "Ii, Fragments of Hawaiian History"), - new OutputAbbreviation(dict, "interr.", "interrogative", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "interj.", "interjection", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "Kam. 1964", "Kamakau, Ka Poʻe Kahiko"), - new OutputAbbreviation(dict, "Kam. 1976", "Kamakau, The Works of the People of Old"), - new OutputAbbreviation(dict, "Kel.", "Kelekona, Kaluaikoolau"), - new OutputAbbreviation(dict, "Kep.", "Beckwith, Kepelino"), - new OutputAbbreviation(dict, "KJV", "King James Version of the Bible"), - new OutputAbbreviation(dict, "KL.", "Beckwith, Kumulipo"), - new OutputAbbreviation(dict, "Laie", "Beckwith, Laieikawai"), - new OutputAbbreviation(dict, "lit.", "literally"), - new OutputAbbreviation(dict, "loc.n.", "locative noun", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "Malo", "Malo, Hawaiian Antiquities, 1951"), - new OutputAbbreviation(dict, "MK", "Ke Alanui o ka Lani, Oia ka Manuale Kakolika"), - new OutputAbbreviation(dict, "n.v.", "noun-verb", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "n.", "noun", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "Nak.", "Nakuina, Moolelo Hawaii ..."), - new OutputAbbreviation(dict, "Nānā", "Pukui, Haertig, Lee, Nānā i ke Kumu"), - new OutputAbbreviation(dict, "Neal", "Neal, In Gardens of Hawaii, 1965"), - new OutputAbbreviation(dict, "num.", "numeral", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "par.", "particle", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "pas/imp.", "passive/imperative", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "PH", "Emerson, Pele and Hiiaka"), - new OutputAbbreviation(dict, "pl.", "plural", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "PCP", "Proto Central Polynesian"), - new OutputAbbreviation(dict, "PEP", "Proto East Polynesian"), - new OutputAbbreviation(dict, "PNP", "Proto Nuclear Polynesian"), - new OutputAbbreviation(dict, "poss.", "possessive", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "PPN", "Proto Polynesian"), - new OutputAbbreviation(dict, "prep.", "preposition", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "RC", "Kamakau, Ruling Chiefs"), - new OutputAbbreviation(dict, "redup.", "reduplication (for meanings of reduplications, see Gram. 6.2.2)"), - new OutputAbbreviation(dict, "RSV", "Holy Bible, Revised Standard Version"), - new OutputAbbreviation(dict, "sg.", "singular", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "sp., spp.", "species"), - new OutputAbbreviation(dict, "TC", "Taro Collection"), - new OutputAbbreviation(dict, "UL", "Emerson, Unwritten Literature"), - new OutputAbbreviation(dict, "v.", "verb", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "var.", "variant, variety"), - new OutputAbbreviation(dict, "nvi.", "noun-intransitive verb", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "nvs.", "noun-stative verb", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "nvt.", "noun-transitive verb", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "vi.", "intransitive verb", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "vs.", "stative verb", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "vt.", "transitive verb", AbbreviationType.Grammatical), - new OutputAbbreviation(dict, "Am.", "Amosa (Amos)"), - new OutputAbbreviation(dict, "Dan.", "Daniela (Daniel)"), - new OutputAbbreviation(dict, "Epeso", "(Ephesians)"), - new OutputAbbreviation(dict, "Eset.", "Esetera (Esther)"), - new OutputAbbreviation(dict, "Ezek.", "Ezekiela (Ezekiel)"), - new OutputAbbreviation(dict, "Ezera", "(Ezra)"), - new OutputAbbreviation(dict, "Gal.", "Galatia (Galatians)"), - new OutputAbbreviation(dict, "Hagai", "(Haggai)"), - new OutputAbbreviation(dict, "Hal.", "Halelu (Psalms)"), - new OutputAbbreviation(dict, "Heb.", "Hebera (Hebrews)"), - new OutputAbbreviation(dict, "Hoik.", "Hoikeana (Revelation)"), - new OutputAbbreviation(dict, "Hos.", "Hosea (Hosea)"), - new OutputAbbreviation(dict, "Iak.", "Iakobo (James)"), - new OutputAbbreviation(dict, "Ier.", "Ieremia (Jeremiah)"), - new OutputAbbreviation(dict, "Ioane", "(John)"), - new OutputAbbreviation(dict, "Ioba", "(Job)"), - new OutputAbbreviation(dict, "Ioela", "(Joel)"), - new OutputAbbreviation(dict, "Ios.", "Iosua (Joshua)"), - new OutputAbbreviation(dict, "Isa.", "Isaia (Isaiah)"), - new OutputAbbreviation(dict, "Iuda", "(Jude)"), - new OutputAbbreviation(dict, "Kanl.", "Kanawailua (Deuteronomy)"), - new OutputAbbreviation(dict, "Kekah.", "Kekahuna (Ecclesiastes)"), - new OutputAbbreviation(dict, "Kin.", "Kinohi (Genesis)"), - new OutputAbbreviation(dict, "Kol.", "Kolosa (Colosians)"), - new OutputAbbreviation(dict, "Kor.", "Korineto (Corinthians)"), - new OutputAbbreviation(dict, "Luka", "(Luke)"), - new OutputAbbreviation(dict, "Lunk.", "Lunakanawai (Judges)"), - new OutputAbbreviation(dict, "Mal.", "Malaki (Malachi)"), - new OutputAbbreviation(dict, "Mar.", "Mareko (Mark)"), - new OutputAbbreviation(dict, "Mat.", "Mataio (Matthew)"), - new OutputAbbreviation(dict, "Mele", "Mele a Solomona (Songs of Solomon)"), - new OutputAbbreviation(dict, "Mika", "(Micah)"), - new OutputAbbreviation(dict, "Nah.", "Nahelu (Numbers)"), - new OutputAbbreviation(dict, "Nal.", "Nalii (Kings)"), - new OutputAbbreviation(dict, "Neh.", "Nehemia (Nehemia)"), - new OutputAbbreviation(dict, "Oih.", "Oihana (Acts)"), - new OutputAbbreviation(dict, "Oihk.", "Oihanakahuna (Leviticus)"), - new OutputAbbreviation(dict, "Oihn.", "Oihanaalii (Chronicles)"), - new OutputAbbreviation(dict, "Pet.", "Petero (Peter)"), - new OutputAbbreviation(dict, "Pilipi", "(Philippians)"), - new OutputAbbreviation(dict, "Puk.", "Pukaana (Exodus)"), - new OutputAbbreviation(dict, "Roma", "(Romans)"), - new OutputAbbreviation(dict, "Ruta", "(Ruth)"), - new OutputAbbreviation(dict, "Sam.", "Samuela (Samuel)"), - new OutputAbbreviation(dict, "Sol.", "Solomona (Proverbs)"), - new OutputAbbreviation(dict, "Tes.", "Tesalonike (Thessalonians)"), - new OutputAbbreviation(dict, "Tim.", "Timoteo (Timothy)"), - new OutputAbbreviation(dict, "Tito", "(Titus)"), - new OutputAbbreviation(dict, "Zek.", "Zekaria (Zechariah)"), - new OutputAbbreviation(dict, "Zep.", "Zepania (Zephaniah)"), - }); + dict.AddAbbreviation("And.", "Andrews dictionary, 1865; reference is given only if no evidence is available other than that in Andrews and Andrews-Parker (AP)"); + dict.AddAbbreviation("AP", "Andrews-Parker dictionary, 1922; reference is given only if no evidence is available other than that in Andrews (And.) and Andrews-Parker"); + dict.AddAbbreviation("Cap.", "beginning with a capital letter"); + dict.AddAbbreviation("caus/sim.", "causative/simulative", AbbreviationType.Grammatical); + dict.AddAbbreviation("cf.", "compare", AbbreviationType.Auxiliary); + dict.AddAbbreviation("conj.", "conjunction", AbbreviationType.Grammatical); + dict.AddAbbreviation("demon.", "demonstrative", AbbreviationType.Grammatical); + dict.AddAbbreviation("Eng.", "word borrowed from English"); + dict.AddAbbreviation("ex.", "example, examples", AbbreviationType.Auxiliary); + dict.AddAbbreviation("f.", "form (in names of plants)"); + dict.AddAbbreviation("fig.", "figuratively"); + dict.AddAbbreviation("For.", "Fornander, Hawaiian Antiquities (For. 4:297 = Fornander Vol. 4, p. 297)"); + dict.AddAbbreviation("FS", "Elbert, Selections from Fornander"); + dict.AddAbbreviation("GP", "Green and Pukui, Legend of Kawelo"); + dict.AddAbbreviation("Gr.", "word probably borrowed from Greek"); + dict.AddAbbreviation("Gram.", "Elbert and Pukui, Hawaiian Grammar"); + dict.AddAbbreviation("Heb.", "word probably borrowed from Hebrew"); + dict.AddAbbreviation("HM", "Beckwith, Hawaiian Mythology"); + dict.AddAbbreviation("HP", "Handy, Hawaiian Planter"); + dict.AddAbbreviation("Ii", "Ii, Fragments of Hawaiian History"); + dict.AddAbbreviation("interr.", "interrogative", AbbreviationType.Grammatical); + dict.AddAbbreviation("interj.", "interjection", AbbreviationType.Grammatical); + dict.AddAbbreviation("Kam. 1964", "Kamakau, Ka Poʻe Kahiko"); + dict.AddAbbreviation("Kam. 1976", "Kamakau, The Works of the People of Old"); + dict.AddAbbreviation("Kel.", "Kelekona, Kaluaikoolau"); + dict.AddAbbreviation("Kep.", "Beckwith, Kepelino"); + dict.AddAbbreviation("KJV", "King James Version of the Bible"); + dict.AddAbbreviation("KL.", "Beckwith, Kumulipo"); + dict.AddAbbreviation("Laie", "Beckwith, Laieikawai"); + dict.AddAbbreviation("lit.", "literally"); + dict.AddAbbreviation("loc.n.", "locative noun", AbbreviationType.Grammatical); + dict.AddAbbreviation("Malo", "Malo, Hawaiian Antiquities, 1951"); + dict.AddAbbreviation("MK", "Ke Alanui o ka Lani, Oia ka Manuale Kakolika"); + dict.AddAbbreviation("n.v.", "noun-verb", AbbreviationType.Grammatical); + dict.AddAbbreviation("n.", "noun", AbbreviationType.Grammatical); + dict.AddAbbreviation("Nak.", "Nakuina, Moolelo Hawaii ..."); + dict.AddAbbreviation("Nānā.", "Pukui, Haertig, Lee, Nānā i ke Kumu"); + dict.AddAbbreviation("Neal", "Neal, In Gardens of Hawaii, 1965"); + dict.AddAbbreviation("num.", "numeral", AbbreviationType.Grammatical); + dict.AddAbbreviation("par.", "particle", AbbreviationType.Grammatical); + dict.AddAbbreviation("pas/imp.", "passive/imperative", AbbreviationType.Grammatical); + dict.AddAbbreviation("PH", "Emerson, Pele and Hiiaka"); + dict.AddAbbreviation("pl.", "plural", AbbreviationType.Grammatical); + dict.AddAbbreviation("PCP", "Proto Central Polynesian"); + dict.AddAbbreviation("PEP", "Proto East Polynesian"); + dict.AddAbbreviation("PNP", "Proto Nuclear Polynesian"); + dict.AddAbbreviation("poss.", "possessive", AbbreviationType.Grammatical); + dict.AddAbbreviation("PPN", "Proto Polynesian"); + dict.AddAbbreviation("prep.", "preposition", AbbreviationType.Grammatical); + dict.AddAbbreviation("RC", "Kamakau, Ruling Chiefs"); + dict.AddAbbreviation("redup.", "reduplication (for meanings of reduplications, see Gram. 6.2.2)"); + dict.AddAbbreviation("RSV", "Holy Bible, Revised Standard Version"); + dict.AddAbbreviation("sg.", "singular", AbbreviationType.Grammatical); + dict.AddAbbreviation("sp., spp.", "species"); + dict.AddAbbreviation("TC", "Taro Collection"); + dict.AddAbbreviation("UL", "Emerson, Unwritten Literature"); + dict.AddAbbreviation("v.", "verb", AbbreviationType.Grammatical); + dict.AddAbbreviation("var.", "variant, variety"); + dict.AddAbbreviation("nvi.", "noun-intransitive verb", AbbreviationType.Grammatical); + dict.AddAbbreviation("nvs.", "noun-stative verb", AbbreviationType.Grammatical); + dict.AddAbbreviation("nvt.", "noun-transitive verb", AbbreviationType.Grammatical); + dict.AddAbbreviation("vi.", "intransitive verb", AbbreviationType.Grammatical); + dict.AddAbbreviation("vs.", "stative verb", AbbreviationType.Grammatical); + dict.AddAbbreviation("vt.", "transitive verb", AbbreviationType.Grammatical); + dict.AddAbbreviation("Am.", "Amosa (Amos)"); + dict.AddAbbreviation("Dan.", "Daniela (Daniel)"); + dict.AddAbbreviation("Epeso", "(Ephesians)"); + dict.AddAbbreviation("Eset.", "Esetera (Esther)"); + dict.AddAbbreviation("Ezek.", "Ezekiela (Ezekiel)"); + dict.AddAbbreviation("Ezera", "(Ezra)"); + dict.AddAbbreviation("Gal.", "Galatia (Galatians)"); + dict.AddAbbreviation("Hagai", "(Haggai)"); + dict.AddAbbreviation("Hal.", "Halelu (Psalms)"); + dict.AddAbbreviation("Heb.", "Hebera (Hebrews)"); + dict.AddAbbreviation("Hoik.", "Hoikeana (Revelation)"); + dict.AddAbbreviation("Hos.", "Hosea (Hosea)"); + dict.AddAbbreviation("Iak.", "Iakobo (James)"); + dict.AddAbbreviation("Ier.", "Ieremia (Jeremiah)"); + dict.AddAbbreviation("Ioane", "(John)"); + dict.AddAbbreviation("Ioba", "(Job)"); + dict.AddAbbreviation("Ioela", "(Joel)"); + dict.AddAbbreviation("Ios.", "Iosua (Joshua)"); + dict.AddAbbreviation("Isa.", "Isaia (Isaiah)"); + dict.AddAbbreviation("Iuda", "(Jude)"); + dict.AddAbbreviation("Kanl.", "Kanawailua (Deuteronomy)"); + dict.AddAbbreviation("Kekah.", "Kekahuna (Ecclesiastes)"); + dict.AddAbbreviation("Kin.", "Kinohi (Genesis)"); + dict.AddAbbreviation("Kol.", "Kolosa (Colosians)"); + dict.AddAbbreviation("Kor.", "Korineto (Corinthians)"); + dict.AddAbbreviation("Luka", "(Luke)"); + dict.AddAbbreviation("Lunk.", "Lunakanawai (Judges)"); + dict.AddAbbreviation("Mal.", "Malaki (Malachi)"); + dict.AddAbbreviation("Mar.", "Mareko (Mark)"); + dict.AddAbbreviation("Mat.", "Mataio (Matthew)"); + dict.AddAbbreviation("Mele", "Mele a Solomona (Songs of Solomon)"); + dict.AddAbbreviation("Mika", "(Micah)"); + dict.AddAbbreviation("Nah.", "Nahelu (Numbers)"); + dict.AddAbbreviation("Nal.", "Nalii (Kings)"); + dict.AddAbbreviation("Neh.", "Nehemia (Nehemia)"); + dict.AddAbbreviation("Oih.", "Oihana (Acts)"); + dict.AddAbbreviation("Oihk.", "Oihanakahuna (Leviticus)"); + dict.AddAbbreviation("Oihn.", "Oihanaalii (Chronicles)"); + dict.AddAbbreviation("Pet.", "Petero (Peter)"); + dict.AddAbbreviation("Pilipi", "(Philippians)"); + dict.AddAbbreviation("Puk.", "Pukaana (Exodus)"); + dict.AddAbbreviation("Roma", "(Romans)"); + dict.AddAbbreviation("Ruta", "(Ruth)"); + dict.AddAbbreviation("Sam.", "Samuela (Samuel)"); + dict.AddAbbreviation("Sol.", "Solomona (Proverbs)"); + dict.AddAbbreviation("Tes.", "Tesalonike (Thessalonians)"); + dict.AddAbbreviation("Tim.", "Timoteo (Timothy)"); + dict.AddAbbreviation("Tito", "(Titus)"); + dict.AddAbbreviation("Zek.", "Zekaria (Zechariah)"); + dict.AddAbbreviation("Zep.", "Zepania (Zephaniah)"); } } } diff --git a/src/HawDict/Output/OutputAbbreviation.cs b/src/HawDict/Output/OutputAbbreviation.cs deleted file mode 100644 index 37f8d99..0000000 --- a/src/HawDict/Output/OutputAbbreviation.cs +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) Jon Thysell -// Licensed under the MIT License. - -using System; - -namespace HawDict -{ - public class OutputAbbreviation - { - public OutputDictBase OutputDict { get; private set; } - - public string Key { get; private set; } = null; - public string Value { get; private set; } = null; - - public AbbreviationType AbbreviationType { get; private set; } = AbbreviationType.None; - - public string XdxfKey - { - get - { - string key = StringUtils.WrapInTag(StringUtils.EscapeForXml(Key), "abbr_k"); - - if (char.IsLower(Key[0]) && Key.Length > 1) - { - key += StringUtils.WrapInTag(StringUtils.EscapeForXml(char.ToUpper(Key[0]) + Key.Substring(1)), "abbr_k"); - } - - return key; - } - } - - public string XdxfValue - { - get - { - string value = StringUtils.EscapeForXml(Value); - - value = StringUtils.WrapInTag(value, "abbr_v"); - - return value; - } - } - - public OutputAbbreviation(OutputDictBase dict, string key, string value, AbbreviationType abbreviationType = AbbreviationType.None) - { - OutputDict = dict ?? throw new ArgumentNullException(nameof(dict)); - - Key = !string.IsNullOrWhiteSpace(key) ? key.Trim() : throw new ArgumentNullException(nameof(key)); - Value = !string.IsNullOrWhiteSpace(value) ? value.Trim() : throw new ArgumentNullException(nameof(value)); - - AbbreviationType = abbreviationType; - } - } - - public enum AbbreviationType - { - None, - Grammatical, - Stylistic, - Knowledge, - Auxiliary, - Other - } -} diff --git a/src/HawDict/Output/OutputArticle.cs b/src/HawDict/Output/OutputArticle.cs deleted file mode 100644 index 416184a..0000000 --- a/src/HawDict/Output/OutputArticle.cs +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright (c) Jon Thysell -// Licensed under the MIT License. - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text.RegularExpressions; - -namespace HawDict -{ - public class OutputArticle - { - public OutputDictBase OutputDict { get; private set; } - - public string Key { get; private set; } = null; - public string Value { get; private set; } = null; - - public string XdxfKey - { - get - { - string xdxfKey = ""; - foreach (string key in Key.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries)) - { - string rawKey = StringUtils.EscapeForXml(key); - xdxfKey += GetXdxfKey(rawKey); - } - - return xdxfKey; - } - } - - public string XdxfValue - { - get - { - return GetXdxfValue(); - } - } - - public string StarDictKey - { - get - { - return Key; - } - } - - public IEnumerable StarDictKeySynonyms - { - get - { - if (_starDictKeySynonyms is null) - { - _starDictKeySynonyms = new HashSet - { - StarDictKey - }; - - foreach (string key in StarDictKey.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries)) - { - foreach (string synonym in GetSynonyms(key.Trim())) - { - _starDictKeySynonyms.Add(synonym); - } - } - - _starDictKeySynonyms.Remove(StarDictKey); - } - return _starDictKeySynonyms; - } - } - private HashSet _starDictKeySynonyms; - - public string StarDictValue - { - get - { - string value = GetXdxfValue(true); - - value = value - .Replace("", "").Replace("", ""); - - value = value - .Replace("", "").Replace("", "") - .Replace("", "

").Replace("", "

"); - - value = Regex.Replace(value, "

([0-9]+)\\. ", "

$1. "); - - if (value.Contains("2. ")) - { - // Fix bolding number one for pre-text - value = Regex.Replace(value, "

(.*[^>])1\\. ", "

$11. "); - } - - return value; - } - } - - public OutputArticle(OutputDictBase dict, string key, string value) - { - OutputDict = dict ?? throw new ArgumentNullException(nameof(dict)); - - Key = !string.IsNullOrWhiteSpace(key) ? key.Trim() : throw new ArgumentNullException(nameof(key)); - Value = !string.IsNullOrWhiteSpace(value) ? value.Trim() : throw new ArgumentNullException(nameof(value)); - } - - private static string GetXdxfKey(string key) - { - key = StringUtils.WrapInTag(key, StringUtils.SyllableDotUtf8, "opt"); - key = StringUtils.WrapInTag(key, ".", "opt"); - - key = StringUtils.WrapInTag(key, "k"); - - return key; - } - - private string GetXdxfValue(bool keepDefinitionNumbers = false) - { - string value = StringUtils.EscapeForXml(Value); - - // Add abbreviation tags - foreach (OutputAbbreviation abbreviation in OutputDict.Abbreviations) - { - value = AddXdxfAbbreviationTags(value, abbreviation.Key); - - if (char.IsLower(abbreviation.Key[0]) && abbreviation.Key.Length > 1) - { - value = AddXdxfAbbreviationTags(value, char.ToUpper(abbreviation.Key[0]) + abbreviation.Key.Substring(1)); - } - } - - IEnumerable definitions = GetDefinitions(value, keepDefinitionNumbers); - - if (definitions.Count() > 1) - { - value = string.Join("", definitions); - value = $"{value}"; - } - else - { - value = $"{value}"; - } - - return value; - } - - private static IEnumerable GetDefinitions(string value, bool keepDefinitionNumbers, int num = 1) - { - string numStr = $"{num}. "; - string nextNumStr = $" {num + 1}. "; - - int foundIndex = value.IndexOf(numStr); - int nextFoundIndex = value.IndexOf(nextNumStr, foundIndex + 1); - - if (num == 1 && foundIndex > 0 && nextFoundIndex > 0) - { - // Numbered definition with some pre-text - if (keepDefinitionNumbers) - { - yield return value.Substring(0, nextFoundIndex); - } - else - { - yield return value[0..foundIndex] + value[(foundIndex + numStr.Length)..nextFoundIndex]; - } - } - else if (foundIndex == 0 && nextFoundIndex > 0) - { - // Numbered definition without pre-text - if (keepDefinitionNumbers) - { - yield return value[0..nextFoundIndex]; - } - else - { - yield return value[numStr.Length..nextFoundIndex]; - } - } - else if (foundIndex == 0) - { - // Last numbered definition - if (keepDefinitionNumbers) - { - yield return value; - } - else - { - yield return value.Substring(numStr.Length); - } - } - else - { - // No numbers, just one definition - yield return value; - } - - if (nextFoundIndex > 0) - { - foreach (string def in GetDefinitions(value.Substring(nextFoundIndex + 1), keepDefinitionNumbers, num + 1)) - { - yield return def; - } - } - } - - private static string AddXdxfAbbreviationTags(string value, string abbreviation) - { - value = value.Replace($" {abbreviation} ", $" {abbreviation} "); - value = value.Replace($"({abbreviation} ", $"({abbreviation} "); - value = value.Replace($" {abbreviation})", $" {abbreviation})"); - value = value.Replace($"({abbreviation})", $"({abbreviation})"); - - value = value.Replace($"{abbreviation}.", $"{abbreviation}."); - - value = value.Replace($"{abbreviation};", $"{abbreviation};"); - - value = value.Replace($"{abbreviation},", $"{abbreviation},"); - value = value.Replace($"({abbreviation},", $"({abbreviation},"); - - value = value.Replace($"{abbreviation}/", $"{abbreviation}/"); - value = value.Replace($"/{abbreviation}", $"/{abbreviation}"); - - value = value.Replace($"—{abbreviation}", $"—{abbreviation} "); - - if (value.StartsWith(abbreviation + " ")) - { - value = $"{abbreviation}{value.Substring(abbreviation.Length)}"; - } - - if (value.EndsWith(" " + abbreviation)) - { - value = $"{value.Substring(0, value.Length - abbreviation.Length)}{abbreviation}"; - } - - return value; - } - - private static HashSet GetSynonyms(string key) - { - HashSet synonyms = new HashSet - { - key - }; - - string s = key.Replace(StringUtils.SyllableDotUtf8, "").Replace(".", "").Replace("*", "").Replace("-", ""); - - synonyms.Add(s); - synonyms.Add(StringUtils.ReplaceOkina(s)); - synonyms.Add(StringUtils.ReplaceOkina(s, "")); - synonyms.Add(StringUtils.RemoveDiacritics(s)); - synonyms.Add(StringUtils.ReplaceOkina(StringUtils.RemoveDiacritics(s))); - synonyms.Add(StringUtils.ReplaceOkina(StringUtils.RemoveDiacritics(s), "")); - - return synonyms; - } - } -} diff --git a/src/HawDict/Output/OutputDictBase.cs b/src/HawDict/Output/OutputDictBase.cs deleted file mode 100644 index 688d959..0000000 --- a/src/HawDict/Output/OutputDictBase.cs +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) Jon Thysell -// Licensed under the MIT License. - -using System; -using System.Collections.Generic; - -namespace HawDict -{ - public abstract class OutputDictBase - { - public string ID { get; private set; } - - public string FormatType { get; private set; } - - public TranslationType TranslationType { get; private set; } - - #region MetaData - - public string Title { get; set; } = null; - - public string Description { get; set; } = null; - - public DateTime CreationDateTime { get; private set; } = DateTime.UtcNow; - - public List Authors { get; private set; } = new List(); - - public static string FileVersion => AppInfo.Version; - - #endregion - - public List Articles { get; private set; } = new List(); - - public List Abbreviations { get; private set; } = new List(); - - public OutputDictBase(string id, string formatType, TranslationType translationType) - { - ID = !string.IsNullOrWhiteSpace(id) ? id : throw new ArgumentNullException(nameof(id)); - FormatType = !string.IsNullOrWhiteSpace(formatType) ? formatType : throw new ArgumentNullException(nameof(formatType)); - TranslationType = translationType; - } - - public abstract void Save(string dictDir); - } -} diff --git a/src/HawDict/Output/StarDictDictionary.cs b/src/HawDict/Output/StarDictDictionary.cs deleted file mode 100644 index 2396f2a..0000000 --- a/src/HawDict/Output/StarDictDictionary.cs +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright (c) Jon Thysell -// Licensed under the MIT License. - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; - -namespace HawDict -{ - public class StarDictDictionary : OutputDictBase - { - private static readonly StarDictArticleComparer _keyComparer = new StarDictArticleComparer(); - - public StarDictDictionary(string id, TranslationType translationType) : base(id, "StarDict", translationType) { } - - public override void Save(string dictDir) - { - if (string.IsNullOrWhiteSpace(dictDir)) - { - throw new ArgumentNullException(nameof(dictDir)); - } - - SaveDataFiles(dictDir, out long idxFileSize, out int synWordCount); - - SaveIfoFile(dictDir, idxFileSize, synWordCount); - } - - private void SaveDataFiles(string dictDir, out long idxFileSize, out int synWordCount) - { - string dictFile = Path.Combine(dictDir, $"{ID}.{TranslationType}.StarDict.dict"); - string idxFile = Path.Combine(dictDir, $"{ID}.{TranslationType}.StarDict.idx"); - string synFile = Path.Combine(dictDir, $"{ID}.{TranslationType}.StarDict.syn"); - - BinaryWriter dictWriter = new BinaryWriter(new FileStream(dictFile, FileMode.Create), Encoding.UTF8); - BinaryWriter idxWriter = new BinaryWriter(new FileStream(idxFile, FileMode.Create), Encoding.UTF8); - BinaryWriter synWriter = new BinaryWriter(new FileStream(synFile, FileMode.Create), Encoding.UTF8); - - Dictionary articleIndexes = new Dictionary(); - - uint index = 0; - foreach (OutputArticle article in Articles.OrderBy(a => a.StarDictKey, _keyComparer)) - { - long dictArticleOffset = dictWriter.BaseStream.Length; - - idxWriter.Write(article.StarDictKey.ToCharArray()); - idxWriter.Write('\0'); - - dictWriter.Write(article.StarDictValue.ToCharArray()); - - dictWriter.Flush(); - - long dictArticleLength = dictWriter.BaseStream.Length - dictArticleOffset; - - WriteBigEndian(idxWriter, (uint)dictArticleOffset); - WriteBigEndian(idxWriter, (uint)dictArticleLength); - - idxWriter.Flush(); - - articleIndexes[article] = index; - index++; - } - - dictWriter.Flush(); - dictWriter.Close(); - - idxWriter.Flush(); - idxFileSize = idxWriter.BaseStream.Length; - idxWriter.Close(); - - List> synonyms = new List>(); - - foreach (KeyValuePair articleIndex in articleIndexes) - { - uint keyIndex = articleIndex.Value; - foreach (string synonym in articleIndex.Key.StarDictKeySynonyms) - { - synonyms.Add(new KeyValuePair(synonym, keyIndex)); - } - } - - foreach (KeyValuePair synonym in synonyms.OrderBy(kvp => kvp.Key, _keyComparer)) - { - synWriter.Write(synonym.Key.ToCharArray()); - synWriter.Write('\0'); - - WriteBigEndian(synWriter, synonym.Value); - } - - synWordCount = synonyms.Count; - - synWriter.Flush(); - synWriter.Close(); - } - - private void SaveIfoFile(string dictDir, long idxFileSize, int synWordCount) - { - string ifoFile = Path.Combine(dictDir, $"{ID}.{TranslationType}.StarDict.ifo"); - - using BinaryWriter ifoWriter = new BinaryWriter(new FileStream(ifoFile, FileMode.Create), Encoding.UTF8); - - WriteLine(ifoWriter, "StarDict's dict ifo file"); - WriteLine(ifoWriter, "version=2.4.2"); - - WriteLine(ifoWriter, "bookname={0}", Title); - WriteLine(ifoWriter, "wordcount={0}", Articles.Count); - WriteLine(ifoWriter, "synwordcount={0}", synWordCount); - WriteLine(ifoWriter, "idxfilesize={0}", idxFileSize); - WriteLine(ifoWriter, "sametypesequence=h"); - - WriteLine(ifoWriter, "author={0}", string.Join(", ", Authors)); - WriteLine(ifoWriter, "description={0}", Description); - WriteLine(ifoWriter, "date={0}", CreationDateTime.ToString("yyyy.MM.dd")); - } - - private static void WriteLine(BinaryWriter bw, string line, params object[] args) - { - bw.Write(string.Format(line, args).ToCharArray()); - bw.Write('\r'); - bw.Write('\n'); - } - - private static void WriteBigEndian(BinaryWriter bw, uint value) - { - byte[] bytes = BitConverter.GetBytes(value); - - if (BitConverter.IsLittleEndian) - { - Array.Reverse(bytes); - } - - bw.Write(bytes); - } - - private class StarDictArticleComparer : IComparer - { - public int Compare(string x, string y) - { - int result = AsciiStrCmp(x, y); - return result == 0 ? StrCmp(x, y) : result; - } - - private static int AsciiStrCmp(string x, string y) - { - int[] bx = Encoding.UTF8.GetBytes(x).Select(b => (int)(b)).ToArray(); - int[] by = Encoding.UTF8.GetBytes(y).Select(b => (int)(b)).ToArray(); - - int minLength = Math.Min(bx.Length, by.Length); - - for (int i = 0; i < minLength; i++) - { - int cx = AsciiLower(bx[i]); - int cy = AsciiLower(by[i]); - - if (cx != cy) - { - return cx - cy; - } - } - - return bx.Length - by.Length; - } - - private static int AsciiLower(int c) - { - if (c >= 'A' && c <= 'Z') - { - return (c - 'A' + 'a'); - } - return c; - } - - private static int StrCmp(string x, string y) - { - int[] bx = Encoding.UTF8.GetBytes(x).Select(b => (int)(b)).ToArray(); - int[] by = Encoding.UTF8.GetBytes(y).Select(b => (int)(b)).ToArray(); - - int minLength = Math.Min(bx.Length, by.Length); - - for (int i = 0; i < minLength; i++) - { - int cx = bx[i]; - int cy = by[i]; - - if (cx != cy) - { - return cx - cy; - } - } - - return bx.Length - by.Length; - } - } - } -} \ No newline at end of file diff --git a/src/HawDict/Output/XdxfDictionary.cs b/src/HawDict/Output/XdxfDictionary.cs deleted file mode 100644 index 9b0e7fc..0000000 --- a/src/HawDict/Output/XdxfDictionary.cs +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) Jon Thysell -// Licensed under the MIT License. - -using System; -using System.IO; -using System.Text; -using System.Xml; - -namespace HawDict -{ - public class XdxfDictionary : OutputDictBase - { - #region MetaData - - public string FullTitle { get; set; } = null; - - public string SrcUrl { get; set; } = null; - - #endregion - - public XdxfDictionary(string id, TranslationType translationType) : base(id, "XDXF", translationType) { } - - public override void Save(string dictDir) - { - if (string.IsNullOrWhiteSpace(dictDir)) - { - throw new ArgumentNullException(nameof(dictDir)); - } - - string xdxfFile = Path.Combine(dictDir, $"{ID}.{TranslationType}.dict.xdxf"); - - using FileStream fs = new FileStream(xdxfFile, FileMode.Create); - - SaveDictFile(fs); - } - - private void SaveDictFile(Stream output) - { - // Write to StringBuilder - StringBuilder sb = new StringBuilder(); - using (XmlWriter xw = XmlWriter.Create(sb, new XmlWriterSettings() { Encoding = Encoding.UTF8, CloseOutput = false })) - { - xw.WriteStartDocument(); - - xw.WriteStartElement("xdxf"); - - xw.WriteAttributeString("format", "logical"); - xw.WriteAttributeString("revision", "33"); - xw.WriteAttributeString("lang_from", TranslationType == TranslationType.HawToEng ? "HAW" : "ENG"); - xw.WriteAttributeString("lang_to", TranslationType == TranslationType.HawToEng ? "ENG" : "HAW"); - - WriteMetaInfoElements(xw); - - WriteArticles(xw); - - xw.WriteEndElement(); // xdxf - - xw.WriteEndDocument(); - } - - // Load from StringBuilder - XmlDocument doc = new XmlDocument(); - doc.LoadXml(sb.ToString()); - - // Write to stream - using (XmlWriter xw = XmlWriter.Create(output, new XmlWriterSettings() { Encoding = Encoding.UTF8, Indent = true, CloseOutput = false })) - { - doc.Save(xw); - } - } - - private void WriteMetaInfoElements(XmlWriter xw) - { - xw.WriteStartElement("meta_info"); - - xw.WriteElementString("title", Title); - - xw.WriteElementString("full_title", FullTitle); - - if (Authors.Count > 0) - { - xw.WriteStartElement("authors"); - - foreach (string author in Authors) - { - WriteElementStringIfNotNull(xw, "author", author); - } - - xw.WriteEndElement(); // authors - } - - xw.WriteElementString("description", Description); - - if (Abbreviations.Count > 0) - { - xw.WriteStartElement("abbreviations"); - - foreach (OutputAbbreviation abbreviation in Abbreviations) - { - xw.WriteStartElement("abbr_def"); - - switch (abbreviation.AbbreviationType) - { - case AbbreviationType.Grammatical: - xw.WriteAttributeString("type", "grm"); - break; - case AbbreviationType.Stylistic: - xw.WriteAttributeString("type", "stl"); - break; - case AbbreviationType.Knowledge: - xw.WriteAttributeString("type", "knl"); - break; - case AbbreviationType.Auxiliary: - xw.WriteAttributeString("type", "aux"); - break; - case AbbreviationType.Other: - xw.WriteAttributeString("type", "oth"); - break; - } - - xw.WriteRaw(abbreviation.XdxfKey); - xw.WriteRaw(abbreviation.XdxfValue); - - xw.WriteEndElement(); // abbr_def - } - - xw.WriteEndElement(); // abbreviations - } - - xw.WriteElementString("file_ver", FileVersion); - - xw.WriteElementString("creation_date", CreationDateTime.Date.ToString("dd-MM-yyyy")); - - WriteElementStringIfNotNull(xw, "dict_src_url", SrcUrl); - - xw.WriteEndElement(); // meta_info - } - - private void WriteArticles(XmlWriter xw) - { - if (Articles.Count > 0) - { - xw.WriteStartElement("lexicon"); - - foreach (OutputArticle article in Articles) - { - xw.WriteStartElement("ar"); - - xw.WriteRaw(article.XdxfKey); - xw.WriteRaw(article.XdxfValue); - - xw.WriteEndElement(); // ar - } - - xw.WriteEndElement(); // lexicon - } - } - - private static void WriteElementStringIfNotNull(XmlWriter xw, string localName, string value) - { - if (!string.IsNullOrWhiteSpace(value)) - { - xw.WriteElementString(localName, value); - } - } - } -} diff --git a/src/HawDict/StringUtils.cs b/src/HawDict/StringUtils.cs index b3f3c0a..ffdd7f9 100644 --- a/src/HawDict/StringUtils.cs +++ b/src/HawDict/StringUtils.cs @@ -141,54 +141,6 @@ public static string FixSentenceEnd(string s) private static readonly Regex ListSplitterRegex = new Regex(@"([^\(][a-zāēīōū])([,;:])(ʻ?[a-zA-ZāēīōūĀĒĪŌŪʻ][^\)])", RegexOptions.Compiled); private static readonly Regex SentenceSplitterRegex = new Regex(@"([a-zāēīōū])([\.\!\?])(ʻ?[A-ZĀĒĪŌŪ])", RegexOptions.Compiled); - public static string EscapeForXml(string s) - { - if (string.IsNullOrWhiteSpace(s)) - { - throw new ArgumentNullException(nameof(s)); - } - - return s - .Replace("&", "&") - .Replace("<", "<") - .Replace(">", ">").Trim(); - } - - public static string WrapInTag(string s, string tag) - { - if (string.IsNullOrWhiteSpace(s)) - { - throw new ArgumentNullException(nameof(s)); - } - - if (string.IsNullOrWhiteSpace(tag)) - { - throw new ArgumentNullException(nameof(tag)); - } - - return $"<{tag}>{s}"; - } - - public static string WrapInTag(string s, string target, string tag) - { - if (string.IsNullOrWhiteSpace(s)) - { - throw new ArgumentNullException(nameof(s)); - } - - if (string.IsNullOrWhiteSpace(target)) - { - throw new ArgumentNullException(nameof(target)); - } - - if (string.IsNullOrWhiteSpace(tag)) - { - throw new ArgumentNullException(nameof(tag)); - } - - return s.Replace(target, WrapInTag(target, tag)).Trim(); - } - public static string ReplaceOkina(string s, string replacement = "'") { if (string.IsNullOrWhiteSpace(s))