From d902cbfdaf772683479d694b83c6d2ed253dcb06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= Date: Mon, 13 Apr 2026 13:04:22 +0200 Subject: [PATCH] feat: incorporate population data from wikidata This fills in the gaps that CLDR has. --- Makefile | 2 +- README.rst | 14 +++ languages-po/tg.po | 17 ++- population-fallback.csv | 78 +++++++++++++ population.csv | 80 ++++++++++++- scripts/export-cldr-population.py | 16 +++ scripts/export-wikidata-population.py | 105 +++++++++++++++++ weblate_language_data/population.py | 158 +++++++++++++------------- 8 files changed, 377 insertions(+), 93 deletions(-) create mode 100644 population-fallback.csv create mode 100755 scripts/export-wikidata-population.py diff --git a/Makefile b/Makefile index 5da3c43d5..fe214ef7a 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ aliases.csv: scripts/export-iso-aliases.py modules/iso-codes/data/iso_639-2.json ./scripts/export-iso-aliases.py @touch $@ -population.csv: modules/cldr-json/cldr-json/cldr-core/supplemental/territoryInfo.json scripts/export-cldr-population.py +population.csv: modules/cldr-json/cldr-json/cldr-core/supplemental/territoryInfo.json population-fallback.csv scripts/export-cldr-population.py ./scripts/export-cldr-population.py languages.csv: modules/iso-codes/data/iso_639-2.json scripts/export-iso-languages.py scripts/add-iso-population.py aliases.csv population.csv diff --git a/README.rst b/README.rst index 3bd9be483..69c3cbc96 100644 --- a/README.rst +++ b/README.rst @@ -95,6 +95,20 @@ translate.csv * Extracted from `translate-toolkit`_ * Generated using export-translate +population.csv +-------------- + +* Based on CLDR territory population data +* Supplemented by `population-fallback.csv` for base language codes missing or zero in CLDR +* Generated using `scripts/export-cldr-population.py` + +population-fallback.csv +----------------------- + +* Reviewed fallback speaker counts for language codes missing or zero in CLDR +* Currently sourced from Wikidata speaker-count statements (CC0) +* Includes source URLs and dates to keep the imported values auditable + languages-po ------------ diff --git a/languages-po/tg.po b/languages-po/tg.po index 7d6696566..47488d4f2 100644 --- a/languages-po/tg.po +++ b/languages-po/tg.po @@ -41,9 +41,6 @@ msgstr "немисии австриягӣ" msgid "Azerbaijani" msgstr "озарбойҷонӣ" -msgid "Azeri" -msgstr "озарбойҷонӣ" - msgid "Balinese" msgstr "балинӣ" @@ -132,7 +129,7 @@ msgid "Dzongkha" msgstr "дзонгха" msgid "English" -msgstr "Англисӣ" +msgstr "англисӣ" msgid "Esperanto" msgstr "эсперанто" @@ -155,6 +152,9 @@ msgstr "филиппинӣ" msgid "Finnish" msgstr "финӣ" +msgid "Flemish" +msgstr "Фламандӣ" + msgid "French" msgstr "франсузӣ" @@ -257,12 +257,6 @@ msgstr "кореягӣ" msgid "Kurdish" msgstr "курдӣ" -msgid "Kurdish, Central" -msgstr "курдии марказӣ" - -msgid "Kurdish, Sorani" -msgstr "курдии марказӣ" - msgid "Kurukh" msgstr "курукс" @@ -326,6 +320,9 @@ msgstr "менде" msgid "Mexican Spanish" msgstr "испании мексикоӣ" +msgid "Modern Standard Arabic" +msgstr "Стандарти муосири арабӣ" + msgid "Mohawk" msgstr "моҳок" diff --git a/population-fallback.csv b/population-fallback.csv new file mode 100644 index 000000000..63bb4ef1c --- /dev/null +++ b/population-fallback.csv @@ -0,0 +1,78 @@ +code,population,source,date,url +agr,56584,wikidata,2017-01-01,http://www.wikidata.org/entity/Q1526530 +aii,232300,wikidata,,http://www.wikidata.org/entity/Q29440 +ain,10,wikidata,2007-01-01,http://www.wikidata.org/entity/Q27969 +ale,200,wikidata,2010-01-01,http://www.wikidata.org/entity/Q27210 +ami,215000,wikidata,2020-01-01,http://www.wikidata.org/entity/Q35132 +anp,740000,wikidata,,http://www.wikidata.org/entity/Q28378 +arp,1000,wikidata,,http://www.wikidata.org/entity/Q56417 +arw,2500,wikidata,1980-01-01,http://www.wikidata.org/entity/Q2655664 +avk,50,wikidata,,http://www.wikidata.org/entity/Q1377116 +azb,14640650,wikidata,2021-01-01,http://www.wikidata.org/entity/Q3449805 +ber,17000000,wikidata,,http://www.wikidata.org/entity/Q25448 +bh,63000000,wikidata,,http://www.wikidata.org/entity/Q135305 +car,10000,wikidata,,http://www.wikidata.org/entity/Q56611 +chn,1,wikidata,,http://www.wikidata.org/entity/Q35173 +chy,2400,wikidata,2010-01-01,http://www.wikidata.org/entity/Q33265 +cnr,232600,wikidata,,http://www.wikidata.org/entity/Q8821 +din,1365900,wikidata,,http://www.wikidata.org/entity/Q56466 +dru,10500,wikidata,2002-01-01,http://www.wikidata.org/entity/Q49232 +fat,2800000,wikidata,,http://www.wikidata.org/entity/Q35570 +gom,3633900,wikidata,2000-01-01,http://www.wikidata.org/entity/Q5575236 +gug,4850000,wikidata,1995-01-01,http://www.wikidata.org/entity/Q17478066 +gum,20782,wikidata,,http://www.wikidata.org/entity/Q2744745 +guw,1539000,wikidata,2021-01-01,http://www.wikidata.org/entity/Q3111668 +hai,90,wikidata,2021-01-01,http://www.wikidata.org/entity/Q33303 +hmn,4500000,wikidata,2015-01-01,http://www.wikidata.org/entity/Q3307894 +hup,8,wikidata,,http://www.wikidata.org/entity/Q28058 +hus,150000,wikidata,,http://www.wikidata.org/entity/Q35573 +io,150,wikidata,2000-01-01,http://www.wikidata.org/entity/Q35224 +isv,7000,wikidata,2020-01-01,http://www.wikidata.org/entity/Q148971 +jpr,60000,wikidata,,http://www.wikidata.org/entity/Q33367 +kmr,14600000,wikidata,2019-01-01,http://www.wikidata.org/entity/Q36163 +kr,15000000,wikidata,,http://www.wikidata.org/entity/Q36094 +kut,100,wikidata,2011-01-01,http://www.wikidata.org/entity/Q33434 +lam,219000,wikidata,,http://www.wikidata.org/entity/Q36098 +lfn,200,wikidata,,http://www.wikidata.org/entity/Q146803 +lus,500000,wikidata,1997-01-01,http://www.wikidata.org/entity/Q36147 +mhr,482000,wikidata,2012-01-01,http://www.wikidata.org/entity/Q3906614 +miq,180000,wikidata,,http://www.wikidata.org/entity/Q1516803 +mnc,20,wikidata,2007-01-01,http://www.wikidata.org/entity/Q33638 +mnk,1350000,wikidata,,http://www.wikidata.org/entity/Q33678 +mrh,400000,wikidata,,http://www.wikidata.org/entity/Q4175893 +mwl,15000,wikidata,2000-01-01,http://www.wikidata.org/entity/Q13330 +nah,1925620,wikidata,2015-01-01,http://www.wikidata.org/entity/Q13300 +nhn,40000,wikidata,1980-01-01,http://www.wikidata.org/entity/Q6047309 +nia,825768,wikidata,,http://www.wikidata.org/entity/Q2407831 +nog,87119,wikidata,2010-01-01,http://www.wikidata.org/entity/Q33871 +ovd,2000,wikidata,2009-01-01,http://www.wikidata.org/entity/Q254950 +pbb,140000,wikidata,,http://www.wikidata.org/entity/Q33677 +prs,9600000,wikidata,2011-01-01,http://www.wikidata.org/entity/Q178440 +pwn,96000,wikidata,,http://www.wikidata.org/entity/Q715755 +rap,5000,wikidata,,http://www.wikidata.org/entity/Q36746 +rar,17500,wikidata,2018-01-01,http://www.wikidata.org/entity/Q36745 +rom,46000000,wikidata,,http://www.wikidata.org/entity/Q13201 +rup,112960,wikidata,1999-01-01,http://www.wikidata.org/entity/Q29316 +sad,40000,wikidata,,http://www.wikidata.org/entity/Q34016 +sel,600,wikidata,2022-01-01,http://www.wikidata.org/entity/Q34008 +sgs,500000,wikidata,2009-01-01,http://www.wikidata.org/entity/Q213434 +sjd,500,wikidata,,http://www.wikidata.org/entity/Q33656 +smi,25000,wikidata,,http://www.wikidata.org/entity/Q56463 +sxu,2000000,wikidata,1998-01-01,http://www.wikidata.org/entity/Q699284 +syc,575000,wikidata,,http://www.wikidata.org/entity/Q33538 +szy,958,wikidata,2018-01-01,http://www.wikidata.org/entity/Q718269 +tay,80000,wikidata,,http://www.wikidata.org/entity/Q715766 +ter,16000,wikidata,2006-01-01,http://www.wikidata.org/entity/Q3314742 +tl,23600000,wikidata,2019-01-01,http://www.wikidata.org/entity/Q34057 +tlh,9000,wikidata,2016-01-01,http://www.wikidata.org/entity/Q10134 +tli,500,wikidata,2016-01-01,http://www.wikidata.org/entity/Q27792 +tok,1696,wikidata,2022-01-01,http://www.wikidata.org/entity/Q36846 +tw,3000000,wikidata,,http://www.wikidata.org/entity/Q36850 +vo,20,wikidata,2000-01-01,http://www.wikidata.org/entity/Q36986 +vot,62,wikidata,2010-01-01,http://www.wikidata.org/entity/Q32858 +was,20,wikidata,2011-01-01,http://www.wikidata.org/entity/Q34198 +wen,60000,wikidata,,http://www.wikidata.org/entity/Q25442 +xal,431800,wikidata,2021-01-01,http://www.wikidata.org/entity/Q33634 +zap,777000,wikidata,,http://www.wikidata.org/entity/Q13214 +zen,200,wikidata,2018-01-01,http://www.wikidata.org/entity/Q37005 +zun,9650,wikidata,,http://www.wikidata.org/entity/Q10188 diff --git a/population.csv b/population.csv index 09c477482..2877d06c9 100644 --- a/population.csv +++ b/population.csv @@ -31,10 +31,14 @@ af_NA,2102745 af_ZA,7857538 agq,43352 agq_CM,43352 +agr,56584 agu,47463 agu_GT,47463 +aii,232300 +ain,10 ak,13489749 ak_GH,13489749 +ale,200 aln,1463046 aln_XK,1463046 alt,19714 @@ -42,12 +46,14 @@ alt_RU,19714 am,39176975 am_ET,39121500 am_IL,55475 +ami,215000 amo,20596 amo_NG,20596 an,24585 an_ES,24585 ann,0 ann_NG,0 +anp,740000 aoz,760217 aoz_ID,760217 apc,43682528 @@ -102,10 +108,12 @@ arn,279970 arn_CL,279970 aro,110 aro_BO,110 +arp,1000 arq,39028675 arq_DZ,39028675 ars,1096332 ars_SA,1096332 +arw,2500 ary,32527212 ary_MA,32527212 arz,71198080 @@ -120,6 +128,7 @@ atj,6983 atj_CA,6983 av,549201 av_RU,549201 +avk,50 awa,27458162 awa_IN,26773470 awa_NP,684692 @@ -137,6 +146,7 @@ az_Cyrl,1185333 az_Cyrl_AZ,1054369 az_Cyrl_RU,130963 az_TR,622484 +azb,14640650 ba,1830673 ba_RU,1830673 bal,8751140 @@ -171,6 +181,7 @@ bej,2725234 bej_SD,2725234 bem,7071694 bem_ZM,7071694 +ber,17000000 bew,5912802 bew_ID,5912802 bez,1146855 @@ -200,6 +211,7 @@ bgn_IR,494966 bgn_PK,1438474 bgx,386949 bgx_TR,386949 +bh,63000000 bhb,1690955 bhb_IN,1690955 bhi,1296399 @@ -315,6 +327,7 @@ cad,0 cad_US,0 cak,1606457 cak_GT,1606457 +car,10000 cch,49716 cch_NG,49716 ccp,765689 @@ -335,12 +348,14 @@ chk,29880 chk_FM,29880 chm,521037 chm_RU,521037 +chn,1 cho,11284 cho_US,11284 chp,12414 chp_CA,12414 chr,26331 chr_US,26331 +chy,2400 cic,0 cic_US,0 cja,273019 @@ -354,6 +369,7 @@ ckz,3285 ckz_GT,3285 clc,853 clc_CA,853 +cnr,232600 co,164099 co_FR,164099 cop,6674820 @@ -432,6 +448,7 @@ den,2327 den_CA,2327 dgr,2133 dgr_CA,2133 +din,1365900 dje,4478276 dje_NE,4478276 dnj,1628814 @@ -439,6 +456,7 @@ dnj_CI,1199272 dnj_LR,429542 doi,2818260 doi_IN,2818260 +dru,10500 dsb,6981 dsb_DE,6981 dtm,241896 @@ -706,6 +724,7 @@ fa_QA,280729 fa_TJ,81073 fan,915873 fan_GQ,915873 +fat,2800000 fbl,2720371 fbl_PH,2720371 ff,8955864 @@ -880,6 +899,7 @@ gn,6095531 gn_AR,22087 gn_BO,55404 gn_PY,6018040 +gom,3633900 gon,3381911 gon_IN,3381911 gor,1154404 @@ -909,8 +929,11 @@ gub,18484 gub_BR,18484 guc,133888 guc_CO,133888 +gug,4850000 +gum,20782 gur,1210618 gur_GH,1210618 +guw,1539000 guz,2854073 guz_KE,2854073 gv,1660 @@ -927,6 +950,7 @@ ha_Arab_SD,908411 ha_GH,297466 ha_NE,10800548 ha_NG,30777110 +hai,90 hak,32568920 hak_CN,32568920 hak_Hant,2595483 @@ -950,6 +974,7 @@ hif,390160 hif_FJ,390160 hil,9935268 hil_PH,9935268 +hmn,4500000 hnd,1034692 hnd_PK,1034692 hne,15500430 @@ -1001,8 +1026,10 @@ hu_RS,319306 hu_SI,9230 hu_SK,612001 hu_UA,131948 +hup,8 hur,1396 hur_CA,1396 +hus,150000 hy,5443504 hy_AM,2947002 hy_CY,2773 @@ -1038,10 +1065,11 @@ ilo,11354592 ilo_PH,11354592 inh,225313 inh_RU,225313 -io,0 +io,150 io_US,0 is,364036 is_IS,364036 +isv,7000 it,70475318 it_AT,807118 it_AU,508603 @@ -1084,6 +1112,7 @@ jmc,499219 jmc_TZ,499219 jml,995916 jml_NP,995916 +jpr,60000 jut,0 jut_DK,0 jv,96145857 @@ -1188,6 +1217,7 @@ km,15186693 km_KH,15186693 kmb,9300525 kmb_AO,9300525 +kmr,14600000 kn,52137810 kn_IN,52137810 knf,95020 @@ -1216,6 +1246,7 @@ kpe_GN,531475 kpe_LR,1087450 kqn,395182 kqn_ZM,395182 +kr,15000000 krc,239395 krc_RU,239395 kri,8664997 @@ -1256,6 +1287,7 @@ ku_SY,1909232 ku_TR,4626572 kum,281642 kum_RU,281642 +kut,100 kv,253477 kv_RU,253477 kvr,394186 @@ -1289,6 +1321,7 @@ lah_IN,35228 lah_PK,100945600 laj,1872754 laj_UG,1872754 +lam,219000 lb,449740 lb_LU,449740 lbe,109840 @@ -1308,6 +1341,7 @@ lep_IN,49319 lep_NP,2832 lez,253477 lez_RU,253477 +lfn,200 lg,6406790 lg_UG,6406790 li,977482 @@ -1366,6 +1400,7 @@ lun,395182 lun_ZM,395182 luo,5708147 luo_KE,5708147 +lus,500000 luy,6407104 luy_KE,6407104 luz,1060642 @@ -1444,12 +1479,14 @@ mh,59868 mh_MH,59868 mhn,1402 mhn_IT,1402 +mhr,482000 mi,144513 mi_NZ,144513 mic,9310 mic_CA,9310 min,8446860 min_ID,8446860 +miq,180000 mk,1612846 mk_AL,14603 mk_GR,167377 @@ -1469,9 +1506,11 @@ mn_MN,3051962 mn_Mong,3681704 mn_Mong_CN,3681704 mn_RU,2112 +mnc,20 mni,1568599 mni_BD,18556 mni_IN,1550043 +mnk,1350000 mnw,981772 mnw_MM,862906 mnw_TH,118865 @@ -1488,6 +1527,7 @@ mr,98639100 mr_IN,98639100 mrd,258315 mrd_NP,258315 +mrh,400000 mrj,29572 mrj_RU,29572 mro,30365 @@ -1513,6 +1553,7 @@ mvy,353309 mvy_PK,353309 mwk,1099530 mwk_ML,1099530 +mwl,15000 mwr,16909560 mwr_IN,16909560 mwv,67574 @@ -1539,6 +1580,7 @@ mzn,4419345 mzn_IR,4419345 na,6924 na_NR,6924 +nah,1925620 nan,26904760 nan_CN,26904760 nan_Hant,13473164 @@ -1573,8 +1615,10 @@ ngl,2267868 ngl_MZ,2267868 nhe,509886 nhe_MX,509886 +nhn,40000 nhw,509886 nhw_MX,509886 +nia,825768 nij,1041779 nij_ID,1041779 niu,1120 @@ -1605,6 +1649,7 @@ nod,6712416 nod_TH,6712416 noe,1831869 noe_IN,1831869 +nog,87119 nqo,699310 nqo_GN,699310 nr,967081 @@ -1655,6 +1700,7 @@ os_GE,107821 os_RU,450627 osa,0 osa_US,0 +ovd,2000 pa,219603179 pa_CA,1008664 pa_GB,2464527 @@ -1672,6 +1718,7 @@ pap_BQ,16200 pap_CW,124164 pau,16179 pau_PW,16179 +pbb,140000 pcd,752120 pcd_FR,752120 pcm,49716870 @@ -1729,6 +1776,7 @@ prd_AF,481459 prd_IR,441934 prg,38 prg_PL,38 +prs,9600000 ps,58062880 ps_AE,290933 ps_AF,17252288 @@ -1753,6 +1801,7 @@ pt_ST,190026 pt_TL,889076 puu,220959 puu_GA,220959 +pwn,96000 qu,11942570 qu_BO,3939840 qu_EC,3112700 @@ -1767,6 +1816,8 @@ quv,9857 quv_GT,9857 raj,1409130 raj_IN,1409130 +rap,5000 +rar,17500 rcf,559184 rcf_RE,559184 rej,1295185 @@ -1815,6 +1866,7 @@ rob,309718 rob_ID,309718 rof,499219 rof_TZ,499219 +rom,46000000 rtm,2474 rtm_FJ,2474 ru,201151468 @@ -1846,6 +1898,7 @@ rue,427941 rue_UA,427941 rug,10175 rug_SB,10175 +rup,112960 rw,11963415 rw_CD,438531 rw_RW,10489941 @@ -1856,6 +1909,7 @@ ryu,948655 ryu_JP,948655 sa,16909 sa_IN,16909 +sad,40000 saf,4150 saf_GH,4150 sah,450627 @@ -1899,11 +1953,12 @@ seh,1534146 seh_MZ,1534146 sei,915 sei_MX,915 +sel,600 ses,747680 ses_ML,747680 sg,2768970 sg_CF,2768970 -sgs,0 +sgs,500000 sgs_LT,0 shi,3252721 shi_Latn,3252721 @@ -1916,6 +1971,7 @@ si,14948168 si_LK,14948168 sid,4149250 sid_ET,4149250 +sjd,500 sk,6809719 sk_CZ,1734064 sk_HU,11826 @@ -1938,6 +1994,7 @@ sm_TK,329 sm_WS,208853 sma,296 sma_SE,296 +smi,25000 smj,1482 smj_SE,1482 smn,618 @@ -2033,6 +2090,8 @@ swv,3945564 swv_IN,3945564 sxn,259037 sxn_ID,259037 +sxu,2000000 +syc,575000 syl,8434850 syl_BD,8434850 syr,230463 @@ -2040,6 +2099,7 @@ syr_IQ,210417 syr_SY,20046 szl,503701 szl_PL,503701 +szy,958 ta,90608136 ta_CA,252166 ta_GB,2190691 @@ -2051,6 +2111,7 @@ ta_RE,118137 ta_SG,126597 taj,133826 taj_NP,133826 +tay,80000 tbw,10053 tbw_PH,10053 tcy,2113695 @@ -2068,6 +2129,7 @@ tem_SL,2371473 teo,2353060 teo_KE,431023 teo_UG,1922037 +ter,16000 tet,889076 tet_TL,889076 tg,10394100 @@ -2100,6 +2162,9 @@ tkr,17040 tkr_AZ,17040 tkt,74693 tkt_NP,74693 +tl,23600000 +tlh,9000 +tli,500 tly,1043719 tly_AZ,1043719 tmh,2042370 @@ -2118,6 +2183,7 @@ tog,213280 tog_MW,213280 toi,2287901 toi_ZM,2287901 +tok,1696 tpi,7132802 tpi_PG,7132802 tr,82419542 @@ -2164,6 +2230,7 @@ tum_MW,1828117 tum_ZM,540776 tvl,9973 tvl_TV,9973 +tw,3000000 twq,7902 twq_NE,7902 ty,94097 @@ -2248,7 +2315,8 @@ vmf,5047146 vmf_DE,5047146 vmw,4335630 vmw_MZ,4335630 -vot,0 +vo,20 +vot,62 vot_RU,0 vro,68046 vro_EE,68046 @@ -2264,12 +2332,14 @@ wal_ET,2133900 war,3434688 war_CA,4655 war_PH,3430032 +was,20 wbp,2489 wbp_AU,2489 wbq,2536434 wbq_IN,2536434 wbr,2113695 wbr_IN,2113695 +wen,60000 wls,9418 wls_WF,9418 wni,306047 @@ -2281,6 +2351,7 @@ wtm,6481998 wtm_IN,6481998 wuu,84962400 wuu_CN,84962400 +xal,431800 xav,9902 xav_BR,9902 xh,10901720 @@ -2329,10 +2400,12 @@ za,4389724 za_CN,4389724 zag,257383 zag_SD,257383 +zap,777000 zdj,333052 zdj_KM,333052 zea,248813 zea_NL,248813 +zen,200 zgh,8225272 zgh_MA,8225272 zh_Hans,1286444445 @@ -2367,5 +2440,6 @@ zu_MW,71818 zu_MZ,1800 zu_SZ,77390 zu_ZA,14506224 +zun,9650 zza,1177672 zza_TR,1177672 diff --git a/scripts/export-cldr-population.py b/scripts/export-cldr-population.py index 45fe68e3f..9c143abe6 100755 --- a/scripts/export-cldr-population.py +++ b/scripts/export-cldr-population.py @@ -4,9 +4,11 @@ # # SPDX-License-Identifier: MIT +import csv import json from collections import defaultdict from collections.abc import Generator +from pathlib import Path MAPPING = { "zh": "zh_Hans", @@ -37,6 +39,16 @@ def get_region_countries(text: str) -> Generator[str]: for country in get_region_countries(code): REGION_COUNTRIES[country] = code + +def load_fallback_populations() -> dict[str, int]: + fallback_file = Path("population-fallback.csv") + if not fallback_file.exists(): + return {} + with fallback_file.open() as handle: + reader = csv.DictReader(handle) + return {row["code"]: int(row["population"]) for row in reader} + + with open( "modules/cldr-json/cldr-json/cldr-core/supplemental/territoryInfo.json", ) as handle: @@ -54,6 +66,10 @@ def get_region_countries(text: str) -> Generator[str]: if code in REGION_COUNTRIES and language in REGION_LANGUAGES: languages[f"{language}_{REGION_COUNTRIES[code]}"] += population * factor +for code, population in load_fallback_populations().items(): + if int(languages.get(code, 0)) == 0: + languages[code] = population + with open("population.csv", "w") as handle: handle.write("code,population\n") for code in sorted(languages): diff --git a/scripts/export-wikidata-population.py b/scripts/export-wikidata-population.py new file mode 100755 index 000000000..c4ad2ffc0 --- /dev/null +++ b/scripts/export-wikidata-population.py @@ -0,0 +1,105 @@ +#! /usr/bin/env python3 + +# Copyright © Michal Čihař +# +# SPDX-License-Identifier: MIT + +""" +Export fallback speaker counts for languages not covered by CLDR territory data. + +This is a maintenance helper and is intentionally not part of the default build. +It queries Wikidata and emits rows for base language codes that are missing or +zero in population.csv. The resulting CSV can be reviewed and committed as +population-fallback.csv. +""" + +from __future__ import annotations + +import csv +import io +import sys +import urllib.parse +import urllib.request +from collections import defaultdict + +QUERY = """ +SELECT ?item ?code ?tag ?speakers ?date WHERE { + ?item p:P1098 ?statement . + ?statement psv:P1098 ?speaker_value . + ?speaker_value wikibase:quantityAmount ?speakers . + OPTIONAL { ?statement pq:P585 ?date . } + OPTIONAL { ?item wdt:P220 ?code . } + OPTIONAL { ?item wdt:P305 ?tag . } + FILTER(BOUND(?code) || BOUND(?tag)) +} +""" + + +def load_zero_population_codes() -> list[str]: + with open("languages.csv") as csvfile: + reader = csv.reader(csvfile, delimiter=",") + next(reader) + languages = [ + row[0] for row in reader if "_" not in row[0] and "@" not in row[0] + ] + + with open("population.csv") as csvfile: + reader = csv.reader(csvfile, delimiter=",") + next(reader) + populations = dict(reader) + + return [code for code in languages if int(populations.get(code, "0")) == 0] + + +def fetch_wikidata_rows() -> list[dict[str, str]]: + params = urllib.parse.urlencode({"query": QUERY}) + request = urllib.request.Request( + f"https://query.wikidata.org/sparql?{params}", + headers={ + "Accept": "text/csv", + "User-Agent": "weblate-language-data population refresh", + }, + ) + with urllib.request.urlopen(request) as handle: + return list(csv.DictReader(io.TextIOWrapper(handle, encoding="utf-8"))) + + +def get_best_row(rows: list[dict[str, str]]) -> dict[str, str] | None: + positive = { + (row["item"], row["speakers"], row["date"]): row + for row in rows + if float(row["speakers"]) > 0 + }.values() + if not positive: + return None + return max(positive, key=lambda row: (row["date"] or "", float(row["speakers"]))) + + +def main() -> None: + rows_by_code: defaultdict[str, list[dict[str, str]]] = defaultdict(list) + rows_by_tag: defaultdict[str, list[dict[str, str]]] = defaultdict(list) + for row in fetch_wikidata_rows(): + if row["code"]: + rows_by_code[row["code"]].append(row) + if row["tag"]: + rows_by_tag[row["tag"]].append(row) + + writer = csv.writer(sys.stdout) + writer.writerow(["code", "population", "source", "date", "url"]) + for code in sorted(load_zero_population_codes()): + best_row = get_best_row(rows_by_code[code] + rows_by_tag[code]) + if best_row is None: + continue + writer.writerow( + [ + code, + int(float(best_row["speakers"])), + "wikidata", + best_row["date"][:10], + best_row["item"], + ] + ) + + +if __name__ == "__main__": + main() diff --git a/weblate_language_data/population.py b/weblate_language_data/population.py index 938bc432e..e63e66908 100644 --- a/weblate_language_data/population.py +++ b/weblate_language_data/population.py @@ -25,19 +25,19 @@ "aeb": 10843920, "af": 9966164, "afh": 0, - "aii": 0, - "ain": 0, - "agr": 0, + "aii": 232300, + "ain": 10, + "agr": 56584, "ak": 13489749, "akk": 0, - "ale": 0, + "ale": 200, "aln": 1463046, "alt": 19714, "am": 39176975, - "ami": 0, + "ami": 215000, "an": 24585, "ang": 0, - "anp": 0, + "anp": 740000, "apc": 43682528, "apd": 30785053, "ar": 378792526, @@ -53,21 +53,21 @@ "ar_Latn": 378792526, "arc": 0, "arn": 279970, - "arp": 0, + "arp": 1000, "arq": 39028675, "ars": 1096332, - "arw": 0, + "arw": 2500, "arz": 71198080, "as": 18318690, "asa": 809545, "ast": 614645, "av": 549201, - "avk": 0, + "avk": 50, "awa": 27458162, "ay": 2984003, "ayc": 0, "az": 10101162, - "azb": 0, + "azb": 14640650, "ba": 1830673, "bal": 8751140, "ban": 5068116, @@ -79,13 +79,13 @@ "be_Latn": 2991098, "bej": 2725234, "bem": 7071694, - "ber": 0, + "ber": 17000000, "bew": 5912802, "bez": 1146855, "bg": 7684020, "bgc": 16909560, "bgn": 2186207, - "bh": 0, + "bh": 63000000, "bhb": 1690955, "bhi": 1296399, "bho": 34639016, @@ -121,7 +121,7 @@ "ca_AD": 43538, "cad": 0, "cak": 1606457, - "car": 0, + "car": 10000, "cdo_Hans": 0, "cdo_Hant": 0, "cdo_Latn": 0, @@ -133,16 +133,16 @@ "chg": 0, "chk": 29880, "chm": 521037, - "chn": 0, + "chn": 1, "cho": 11284, "chp": 12414, "chr": 26331, - "chy": 0, + "chy": 2400, "ckb": 5606845, "ckb_IQ": 5050008, "ckb_IR": 556837, - "cnr": 0, - "cnr_Cyrl": 0, + "cnr": 232600, + "cnr_Cyrl": 232600, "co": 164099, "cop": 6674820, "cpe": 0, @@ -174,12 +174,12 @@ "del": 0, "den": 2327, "dgr": 2133, - "din": 0, + "din": 1365900, "dje": 4478276, "dnj": 1628814, "dnk": 0, "doi": 2818260, - "dru": 0, + "dru": 10500, "dry": 0, "dsb": 6981, "dua": 148637, @@ -241,7 +241,7 @@ "ext": 231673, "fa": 89208445, "fan": 915873, - "fat": 0, + "fat": 2800000, "fbl": 2720371, "ff": 8955864, "ffm": 1693276, @@ -286,7 +286,7 @@ "gmh": 0, "gn": 6095531, "goh": 0, - "gom": 0, + "gom": 3633900, "gon": 3381911, "gor": 1154404, "got": 0, @@ -296,16 +296,16 @@ "gu": 65626182, "gu_IN": 63410850, "guc": 133888, - "gug": 0, - "gum": 0, + "gug": 4850000, + "gum": 20782, "gun": 0, "gur": 1210618, - "guw": 0, + "guw": 1539000, "guz": 2854073, "gv": 1660, "gwi": 310, "ha": 41875124, - "hai": 0, + "hai": 90, "hak_Hans": 32568920, "hak_Hant": 2595483, "hak_Latn": 32568920, @@ -319,7 +319,7 @@ "hif": 390160, "hil": 9935268, "hit": 0, - "hmn": 0, + "hmn": 4500000, "hnd": 1034692, "hne": 15500430, "hnj": 815127, @@ -333,8 +333,8 @@ "hsn": 41065160, "ht": 9520659, "hu": 12313191, - "hup": 0, - "hus": 0, + "hup": 8, + "hus": 150000, "hy": 5443504, "hz": 255133, "ia": 136, @@ -347,9 +347,9 @@ "ik": 7865, "ilo": 11354592, "inh": 225313, - "io": 0, + "io": 150, "is": 364036, - "isv": 0, + "isv": 7000, "it": 70475318, "it@formal": 70475318, "it@informal": 70475318, @@ -362,7 +362,7 @@ "jbo": 0, "jgo": 105284, "jmc": 499219, - "jpr": 0, + "jpr": 60000, "jrb": 0, "jv": 96145857, "ka": 4324687, @@ -392,15 +392,15 @@ "kln": 4426726, "km": 15186693, "kmb": 9300525, - "kmr": 0, - "kmr_Latn": 0, + "kmr": 14600000, + "kmr_Latn": 14600000, "kn": 52137810, "ko": 79278717, "kok": 4509216, "kok_Latn": 4509216, "kos": 7968, "kpe": 1618925, - "kr": 0, + "kr": 15000000, "krc": 239395, "kri": 8664997, "krl": 115473, @@ -410,7 +410,7 @@ "ksh": 252357, "ku": 7214784, "kum": 281642, - "kut": 0, + "kut": 100, "kv": 253477, "kw": 1985, "kxm": 1188657, @@ -419,10 +419,10 @@ "lad": 122234, "lag": 586920, "laj": 1872754, - "lam": 0, + "lam": 219000, "lb": 449740, "lez": 253477, - "lfn": 0, + "lfn": 200, "lg": 6406790, "li": 977482, "lij": 524298, @@ -445,7 +445,7 @@ "lui": 0, "lun": 395182, "luo": 5708147, - "lus": 0, + "lus": 500000, "luy": 6407104, "luz": 1060642, "lv": 1098762, @@ -471,12 +471,12 @@ "mgh": 1500795, "mgo": 145540, "mh": 59868, - "mhr": 0, + "mhr": 482000, "mi": 144513, "mia": 0, "mic": 9310, "min": 8446860, - "miq": 0, + "miq": 180000, "mis": 0, "mjw": 0, "mk": 1612846, @@ -484,22 +484,22 @@ "mn": 3054074, "mn_Cyrl": 3054074, "mn_Mong": 3681704, - "mnc": 0, + "mnc": 20, "mni": 1568599, - "mnk": 0, + "mnk": 1350000, "mnw": 981772, "moe": 11638, "moh": 1590, "mos": 9216880, "mr": 98639100, - "mrh": 0, + "mrh": 400000, "ms": 36798159, "ms_Arab": 3403837, "mt": 469730, "mtr": 1366856, "mus": 4103, "mwk": 1099530, - "mwl": 0, + "mwl": 15000, "mwr": 16909560, "mww": 2179938, "mxc": 1114776, @@ -509,7 +509,7 @@ "myx": 1429207, "mzn": 4419345, "na": 6924, - "nah": 0, + "nah": 1925620, "nan_Hans": 26904760, "nan_Hant": 13473164, "nan_Hntl_pehoeji": 0, @@ -528,8 +528,8 @@ "new": 1027039, "ng": 588768, "ngl": 2267868, - "nhn": 0, - "nia": 0, + "nhn": 40000, + "nia": 825768, "nij": 1041779, "niu": 1120, "nl": 32854898, @@ -540,7 +540,7 @@ "nnh": 433525, "nod": 6712416, "noe": 1831869, - "nog": 0, + "nog": 87119, "non": 0, "nqo": 699310, "nr": 967081, @@ -563,7 +563,7 @@ "ota": 0, "otk": 0, "otq": 0, - "ovd": 0, + "ovd": 2000, "pa": 219603179, "pa_PK": 176654800, "pag": 1655877, @@ -571,7 +571,7 @@ "pam": 2720371, "pap": 216652, "pau": 16179, - "pbb": 0, + "pbb": 140000, "pcm": 49716870, "pdt": 42674, "peo": 0, @@ -584,7 +584,7 @@ "pon": 22908, "prg": 38, "pro": 0, - "prs": 0, + "prs": 9600000, "ps": 58062880, "pt": 249463918, "pt@formal": 249463918, @@ -594,7 +594,7 @@ "pt_BR@formal": 200247320, "pt_BR@informal": 200247320, "pt_PT": 9798912, - "pwn": 0, + "pwn": 96000, "qdt": 0, "qpv": 0, "qtp": 0, @@ -604,8 +604,8 @@ "qug": 1043670, "qya": 0, "raj": 1409130, - "rap": 0, - "rar": 0, + "rap": 5000, + "rar": 17500, "rcf": 559184, "rej": 1295185, "rhg": 1872054, @@ -618,17 +618,17 @@ "ro": 20043506, "ro_MD": 2267703, "rof": 499219, - "rom": 0, + "rom": 46000000, "ru": 201151468, "ru@formal": 201151468, "ru@informal": 201151468, "ru_UA": 16404428, "rue": 427941, - "rup": 0, + "rup": 112960, "rw": 11963415, "rwk": 148416, "sa": 16909, - "sad": 0, + "sad": 40000, "sah": 450627, "sai": 0, "sam": 0, @@ -646,23 +646,23 @@ "se": 52950, "sef": 1289217, "seh": 1534146, - "sel": 0, + "sel": 600, "ses": 747680, "sg": 2768970, "sga": 0, "sgn": 0, - "sgs": 0, + "sgs": 500000, "shi": 3252721, "shn": 3748858, "si": 14948168, "sid": 4149250, - "sjd": 0, + "sjd": 500, "sk": 6809719, "skr": 30283680, "sl": 1967012, "sm": 252638, "sma": 296, - "smi": 0, + "smi": 25000, "smj": 1482, "sml": 0, "smn": 618, @@ -696,20 +696,20 @@ "swb": 170720, "swg": 841191, "swv": 3945564, - "sxu": 0, - "syc": 0, + "sxu": 2000000, + "syc": 575000, "syl": 8434850, "syr": 230463, "szl": 503701, - "szy": 0, + "szy": 958, "ta": 90608136, "ta_LK": 3297390, - "tay": 0, + "tay": 80000, "tcy": 2113695, "te": 101457360, "tem": 2371473, "teo": 2353060, - "ter": 0, + "ter": 16000, "tet": 889076, "tg": 10394100, "th": 55936800, @@ -718,17 +718,17 @@ "tiv": 3787952, "tk": 7177805, "tkl": 1136, - "tl": 0, - "tlh": 0, + "tl": 23600000, + "tlh": 9000, "tlh_Piqd": 0, - "tli": 0, + "tli": 500, "tly": 1043719, "tmh": 2042370, "tn": 6529139, "to": 99644, "tog": 213280, "toi": 2287901, - "tok": 0, + "tok": 1696, "tpi": 7132802, "the": 0, "tr": 82419542, @@ -742,7 +742,7 @@ "tts": 16781040, "tum": 2368893, "tvl": 9973, - "tw": 0, + "tw": 3000000, "ty": 94097, "tyv": 183067, "tzj": 111356, @@ -768,24 +768,24 @@ "vls": 1197760, "vmf": 5047146, "vmw": 4335630, - "vo": 0, - "vot": 0, + "vo": 20, + "vot": 62, "vro": 68046, "vun": 499219, "wa": 694700, "wae": 11035, "wal": 2133900, "war": 3434688, - "was": 0, + "was": 20, "wbq": 2536434, "wbr": 2113695, - "wen": 0, + "wen": 60000, "wep": 0, "wo": 13203204, "wtm": 6481998, "wuu_Hans": 84962400, "wuu_Hant": 84962400, - "xal": 0, + "xal": 431800, "xh": 10901720, "xnr": 2254608, "xog": 2611999, @@ -798,9 +798,9 @@ "yue_Hans": 73634080, "yue_Hant": 81532220, "za": 4389724, - "zap": 0, + "zap": 777000, "zbl": 0, - "zen": 0, + "zen": 200, "zgh": 8225272, "zh_Hans": 1286444445, "zh_Hans_SG": 4641914, @@ -808,6 +808,6 @@ "zh_Hant_HK": 6932929, "zh_Latn": 0, "zu": 14969090, - "zun": 0, + "zun": 9650, "zza": 1177672, }