-
Notifications
You must be signed in to change notification settings - Fork 143
Korean TN fixes: cardinal, decimal, fraction, date #374
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: ko_tn_staging_v1
Are you sure you want to change the base?
Changes from all commits
a108dce
4331512
aec257a
aebe1f2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,4 @@ | |
| .uk 닷 유케이 | ||
| .br 닷 비알 | ||
| .in 닷 아이엔 | ||
| .ru 닷 알유 | ||
| .jpg 닷 제이피지 | ||
| .png 닷 피엔지 | ||
| .pdf 닷 피디에프 | ||
| .ru 닷 알유 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| .jpg 닷 제이피지 | ||
| .png 닷 피엔지 | ||
| .pdf 닷 피디에프 | ||
| .JPG 닷 제이피지 | ||
| .PNG 닷 피엔지 | ||
| .PDF 닷 피디에프 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,13 +16,19 @@ | |
| import pynini | ||
| from pynini.lib import pynutil | ||
|
|
||
| from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst | ||
| from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst | ||
| from nemo_text_processing.text_normalization.ko.utils import get_abs_path | ||
|
|
||
|
|
||
| class CardinalFst(GraphFst): | ||
| def __init__(self, deterministic: bool = True): | ||
| super().__init__(name="cardinal", kind="classify", deterministic=deterministic) | ||
|
|
||
| # Grouping separators to remove inside numbers (e.g., "1,234", "1’234", NBSP) | ||
| SEP = pynini.union(",", "’", "'", "\u00a0", "\u2009", "\u202f") | ||
| # Optional small whitespace inside parentheses or after signs | ||
| WS = pynini.closure(pynini.accep(" "), 0, 2) | ||
|
|
||
| # Load base .tsv files | ||
| graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) | ||
| graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) | ||
|
|
@@ -53,7 +59,9 @@ def __init__(self, deterministic: bool = True): | |
| graph_thousand = thousands @ graph_thousand_component | ||
|
|
||
| ten_thousands = NEMO_DIGIT**5 | ||
| graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union( | ||
| graph_ten_thousand_component = ( | ||
| pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만')) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add another level of parentheses; the order of operations between concatenation and union should not be relied on here. |
||
| ) + pynini.union( | ||
| pynini.closure(pynutil.delete('0')), | ||
| graph_thousand_component, | ||
| (pynutil.delete('0') + graph_hundred_component), | ||
|
|
@@ -268,8 +276,38 @@ def __init__(self, deterministic: bool = True): | |
| ).optimize() | ||
|
|
||
| # Sign and final formatting | ||
| optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1) | ||
| final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"') | ||
| # Delete group separators when they appear between digits (e.g., "1,234" -> "1234") | ||
| delete_sep_between_digits = pynini.cdrewrite( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Checking: is there any occurrence of European numbering in Korean text? |
||
| pynutil.delete(SEP), | ||
| NEMO_DIGIT, | ||
| NEMO_DIGIT, | ||
| NEMO_SIGMA, | ||
| ) | ||
|
|
||
| # Let the number graph accept numbers with separators | ||
| graph_num_accepting_separators = delete_sep_between_digits @ graph_num | ||
|
|
||
| # Build the integer token (integer: "...") | ||
| integer_token = pynutil.insert('integer: "') + graph_num_accepting_separators + pynutil.insert('"') | ||
|
|
||
| # Sign handling: | ||
| # - minus sets negative flag | ||
| # - plus is ignored (positive number) | ||
| minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-") | ||
| plus_prefix = pynutil.delete("+") | ||
|
|
||
| # Accounting negative: "( 1,234 )" -> negative + integer:"1234" | ||
| paren_negative = ( | ||
| pynutil.insert('negative: "true" ') + pynutil.delete("(") + WS + integer_token + WS + pynutil.delete(")") | ||
| ) | ||
|
|
||
| # Signed number: optional (+|-) + integer | ||
| signed_integer = (minus_prefix | plus_prefix).ques + integer_token | ||
|
|
||
| # Prefer accounting-form first, then signed form | ||
| final_graph = paren_negative | signed_integer | ||
|
|
||
| # Wrap with class tokens and finalize | ||
| final_graph = self.add_tokens(final_graph) | ||
| self.fst = final_graph.optimize() | ||
| self.graph = graph_num.optimize() | ||
| self.graph = graph_num_accepting_separators.optimize() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -121,11 +121,24 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): | |
| dollar_accep = pynini.accep("$") | ||
| excluded_symbols = DOT | dollar_accep | AT | ||
| filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols) | ||
| accepted_characters = ASCII_ALNUM | filtered_symbols | ||
| # Domain core graph | ||
| graph_domain = (pynutil.insert('domain: "') + domain_core + pynutil.insert('"')).optimize() | ||
| graph |= graph_domain | ||
|
|
||
| known_extensions = pynini.project( | ||
| pynini.string_file(get_abs_path("data/electronic/extensions.tsv")), | ||
| "input", | ||
| ) | ||
|
|
||
| filename_stem = pynini.closure( | ||
| pynini.difference(NEMO_NOT_SPACE, pynini.union(SLASH, DOT)), | ||
| 1, | ||
| ) | ||
|
|
||
| file_with_extension = filename_stem + known_extensions | ||
|
|
||
| graph |= (pynutil.insert('domain: "') + file_with_extension + pynutil.insert('"')).optimize() | ||
|
|
||
| # (3) URL with protocol | ||
| graph |= protocol + insert_space + domain_graph_with_class_tags | ||
|
|
||
|
|
@@ -144,9 +157,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): | |
|
|
||
| four = pynini.closure(NEMO_DIGIT, 4, 4) | ||
| sep_token = pynini.union(HYPHEN, NEMO_SPACE) | ||
| sep_del = pynutil.delete(pynini.closure(sep_token, 1)) # allow mix of - or space | ||
|
|
||
| cc16_grouped = four + sep_del + four + sep_del + four + sep_del + four | ||
| sep_to_space = pynutil.delete(pynini.closure(sep_token, 0, 1)) + insert_space | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just do pynutil.delete(""").ques. Same thing. |
||
| cc16_grouped = four + sep_to_space + four + sep_to_space + four + sep_to_space + four | ||
| cc16_grouped = cc16_grouped + delete_space | ||
|
|
||
| cc16_no_cue = ( | ||
| pynutil.insert('protocol: "신용카드 " ') | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,11 +24,11 @@ class TelephoneFst(GraphFst): | |
| Finite state transducer for classifying Korean telephone numbers. | ||
|
|
||
| Example inputs → tokens: | ||
| +82-10-3713-7050 -> telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" } | ||
| +1 (415) 555-0123 -> telephone { country_code: "플러스 일," number_part: "사일오, 오오오, 영일이삼" } | ||
| (031)371-3700 -> telephone { number_part: "영삼일, 삼칠일, 삼칠영영" } | ||
| 010-3713-7050 -> telephone { number_part: "영일영, 삼칠일삼, 칠영오영" } | ||
| 010.777.8888 -> telephone { number_part: "영일영, 칠칠칠, 팔팔팔팔" } | ||
| +82 010-3713-7050 -> telephone { country_code: "국가번호 팔이," number_part: "영일영 삼칠일삼 칠영오영" } | ||
| +1 (415) 555-0123 -> telephone { country_code: "국가번호 일," number_part: "사일오 오오오 영일이삼" } | ||
| (031)371-3700 -> telephone { number_part: "영삼일 삼칠일 삼칠영영" } | ||
| 010-3713-7050 -> telephone { number_part: "영일영 삼칠일삼 칠영오영" } | ||
| 010.777.8888 -> telephone { number_part: "영일영 칠칠칠 팔팔팔팔" } | ||
|
|
||
| Args: | ||
| deterministic (bool, optional): If True, provide a single transduction; | ||
|
|
@@ -37,8 +37,10 @@ class TelephoneFst(GraphFst): | |
|
|
||
| def __init__(self, deterministic: bool = True): | ||
| super().__init__(name="telephone", kind="classify", deterministic=deterministic) | ||
|
|
||
| add_sep = pynutil.insert(", ") # standard block separator ", " | ||
| # Separator between digit blocks (e.g., "-" or ".") | ||
| add_sep = pynutil.delete("-") | pynutil.delete(".") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Rename this: a variable called add_sep that deletes the separator is confusing. |
||
| # Optional space inserted between blocks | ||
| sep_space = insert_space | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This isn't a constructive operation. |
||
|
|
||
| # 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert) | ||
| digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize() | ||
|
|
@@ -49,35 +51,39 @@ def __init__(self, deterministic: bool = True): | |
| four_digits = digit_ko**4 | ||
|
|
||
| # country code: "+1", "+82", "+1-" | ||
| country_core = ( | ||
| pynini.cross("+", "플러스 ") | ||
| + pynini.closure(digit_ko + insert_space, 0, 2) | ||
| + digit_ko | ||
| + pynutil.insert(",") | ||
| cc_digits = pynini.closure(digit_ko, 1, 3) | ||
|
|
||
| country_code = ( | ||
| pynutil.delete("+") | ||
| + pynutil.insert('country_code: "') | ||
| + cc_digits | ||
| + pynutil.insert('"') | ||
| + pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1) | ||
| + delete_space | ||
| ) | ||
| country_code = pynutil.insert('country_code: "') + country_core + pynutil.insert('"') | ||
| country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space | ||
|
|
||
| # area part: "123-" | "123." | "(123)" [space?] or "(123)-" | ||
| area_core = three_digits | ||
| area_part = ( | ||
| (area_core + (pynutil.delete("-") | pynutil.delete("."))) | ||
| (area_core + add_sep) | ||
| | ( | ||
| pynutil.delete("(") | ||
| + area_core | ||
| + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-")) | ||
| + pynutil.delete(")") | ||
| + pynini.closure(pynutil.delete(" "), 0, 1) | ||
| + pynini.closure(add_sep, 0, 1) | ||
| ) | ||
| ) + add_sep | ||
| ) + sep_space | ||
|
|
||
| # 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050) | ||
| mid = pynini.union(three_digits, four_digits) | ||
| last4 = four_digits | ||
|
|
||
| # consume '-' or '.' between middle and last blocks | ||
| number_part_core = area_part + mid + (pynutil.delete("-") | pynutil.delete(".")) + add_sep + last4 | ||
| number_part_core = area_part + mid + add_sep + sep_space + last4 | ||
| number_part = pynutil.insert('number_part: "') + number_part_core + pynutil.insert('"') | ||
|
|
||
| # final graph: with or without country code | ||
| graph = pynini.union(country_code + number_part, number_part).optimize() | ||
| graph = pynini.union(country_code + insert_space + number_part, number_part).optimize() | ||
|
|
||
| self.fst = self.add_tokens(graph).optimize() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,13 @@ | |
| import pynini | ||
| from pynini.lib import pynutil | ||
|
|
||
| from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main | ||
| from nemo_text_processing.text_normalization.ko.graph_utils import ( | ||
| NEMO_WHITE_SPACE, | ||
| GraphFst, | ||
| delete_extra_space, | ||
| delete_space, | ||
| generator_main, | ||
| ) | ||
| from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst | ||
| from nemo_text_processing.text_normalization.ko.taggers.date import DateFst | ||
| from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst | ||
|
|
@@ -98,9 +104,14 @@ def __init__( | |
| ) | ||
|
|
||
| token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") | ||
| tagger = pynini.closure(token, 1) | ||
| space = pynini.closure(NEMO_WHITE_SPACE, 1) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ? Just use NEMO_WHITE_SPACE. |
||
| space = pynini.compose(space, delete_extra_space) | ||
|
|
||
| self.fst = tagger.optimize() | ||
| space_opt = pynini.closure(space, 0, 1) | ||
|
|
||
| graph = delete_space + token + pynini.closure(space_opt + token) + delete_space | ||
|
|
||
| self.fst = graph.optimize() | ||
|
|
||
| if far_file: | ||
| generator_main(far_file, {"tokenize_and_classify": self.fst}) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure you don't want to use NEMO_SPACE?