From 26420d3c7de7d98104205d52cdb1f096f41ea040 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Tue, 18 Mar 2025 20:26:25 -0600 Subject: [PATCH 1/3] =?UTF-8?q?prefer=20endonyms=20over=20English=20names?= =?UTF-8?q?=20=E2=9D=8C=F0=9F=8F=B4=F3=A0=81=A7=F3=A0=81=A2=F3=A0=81=A5?= =?UTF-8?q?=F3=A0=81=AE=F3=A0=81=A7=F3=A0=81=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- py-src/build_locale_regions_json.py | 48 +- py-src/requirements.txt | 1 + .../index.integration.test.tsx.snap | 2 +- src/data/locale_regions.json | 15464 +++++++++------- 4 files changed, 8824 insertions(+), 6691 deletions(-) diff --git a/py-src/build_locale_regions_json.py b/py-src/build_locale_regions_json.py index fbd4595..77f7f19 100644 --- a/py-src/build_locale_regions_json.py +++ b/py-src/build_locale_regions_json.py @@ -7,13 +7,20 @@ import re from babel import Locale, languages, localedata +from requests import get int_locale = Locale('ia') locale_regions = {} unofficial_locale_regions = {} +international_locales = {} +# SIL provided data from CLDR repo that was not included in CLDR release +cldr_extra_resp = get('https://raw.githubusercontent.com/unicode-org/cldr/refs/heads/main/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/external/langtags.json') + + +# aux function to add a language to a given dict or add a region into an already added language def add_locale_to_dict(lang, region, locale_dict, fallback_name=None): if lang in locale_dict: if region not in locale_dict[lang]['regions']: @@ -29,6 +36,7 @@ def add_locale_to_dict(lang, region, locale_dict, fallback_name=None): locale_dict[lang]['name'] = fallback_name +# get locale data from babel library for locale in localedata.locale_identifiers(): locale_parts = locale.split('_') if len(locale_parts) > 1 and re.match(r'^[A-Z]{2}$', locale_parts[-1]): @@ -44,29 +52,45 @@ def add_locale_to_dict(lang, region, locale_dict, fallback_name=None): add_locale_to_dict(lang, region, unofficial_locale_regions) elif locale_parts[-1] == '001': # add United Nations for languages like Esperanto - add_locale_to_dict(locale_parts[0], 'UN', locale_regions) + add_locale_to_dict(locale_parts[0], 'UN', international_locales) + + +# get more languages from extra CLDR data not found in babel +if cldr_extra_resp.ok: + cldr_extra = cldr_extra_resp.json() + for locale in cldr_extra: + if 'iso639_3' not in locale: + continue + lang_id = locale['iso639_3'] + country_id = locale.get('region') + name = locale.get('localname') \ + or locale.get('localenames', [None])[0] \ + or locale.get('names', [None])[0] \ + or locale.get('name') + if not country_id or country_id.isdigit(): + # locales with "international" regions like 001 or 419 + add_locale_to_dict( + lang_id, 'UN', international_locales, fallback_name=name + ) + else: + add_locale_to_dict( + lang_id, country_id, unofficial_locale_regions, fallback_name=name + ) # merge back languages with no official region for lang in unofficial_locale_regions: if lang not in locale_regions: locale_regions[lang] = unofficial_locale_regions[lang] - - -# get more language codes and names from Ethnologue -with open('./data/LanguageCodes.tab') as ethnologue_file: - ethnologue_table = csv.reader(ethnologue_file, delimiter='\t') - next(ethnologue_table) # skip header row - for lang_id, country_id, _, english_name in ethnologue_table: - if lang_id not in locale_regions: - add_locale_to_dict( - lang_id, country_id, locale_regions, fallback_name=english_name - ) +for lang in international_locales: + if lang not in locale_regions: + locale_regions[lang] = international_locales[lang] # sort regions so each run returns same output for locale in locale_regions: locale_regions[locale]['regions'] = sorted(locale_regions[locale]['regions']) + with open('../src/data/locale_regions.json', 'w+') as f: json.dump(locale_regions, f, sort_keys=True, indent=2) diff --git a/py-src/requirements.txt b/py-src/requirements.txt index e42d9da..419b8b2 100644 --- a/py-src/requirements.txt +++ b/py-src/requirements.txt @@ -1,2 +1,3 @@ Babel>=2.16.0 pycodestyle==2.12.1 +requests diff --git a/src/__tests__/__snapshots__/index.integration.test.tsx.snap b/src/__tests__/__snapshots__/index.integration.test.tsx.snap index cdc8c55..07acbb2 100644 --- a/src/__tests__/__snapshots__/index.integration.test.tsx.snap +++ b/src/__tests__/__snapshots__/index.integration.test.tsx.snap @@ -16,7 +16,7 @@ exports[`LangSelecta does not change 1`] = `