diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 62c0f8825..2e68cdcfc 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -37,7 +37,7 @@ jobs: zip -rq docs-${TAG_NAME}.zip doc doc-data - name: Release - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@v3 with: files: | docs/docs-${{ github.ref_name }}.tar.gz diff --git a/UnicodeBasic.lean b/UnicodeBasic.lean index b55675191..eca1c62aa 100644 --- a/UnicodeBasic.lean +++ b/UnicodeBasic.lean @@ -58,6 +58,27 @@ namespace Unicode @[inline] def getName (char : Char) : String := lookupName char.val +/-! + ## Script ## +-/ + +/-- Get character script + + Unicode property: `Script` +-/ +@[inline] +def getScript (char : Char) : Script := lookupScript char.val + +/-- Get script name + + Returns `none` if the script code is unassigned. + + Unicode property: `Script` +-/ +@[inline] +def getScriptName? (s : Script) : Option String := + lookupScriptName s |>.map toString + /-! ## Bidirectional Properties ## -/ diff --git a/UnicodeBasic/TableLookup.lean b/UnicodeBasic/TableLookup.lean index d4a1eb9dd..2733a4bce 100644 --- a/UnicodeBasic/TableLookup.lean +++ b/UnicodeBasic/TableLookup.lean @@ -22,6 +22,9 @@ protected abbrev oMath : UInt64 := 0x800000000 @[extern "unicode_prop_lookup"] protected opaque lookupProp (c : UInt32) : UInt64 +@[extern "unicode_script_lookup"] +protected opaque lookupScript (c : UInt32) : Script + end CLib /-- Binary search -/ @@ -386,4 +389,22 @@ where str : String := include_str "../data/table/White_Space.txt" table : Thunk <| Array (UInt32 × UInt32) := parsePropTable str +/-- Get the script of a code point using lookup table + + Unicode property: `Script` -/ +@[inline] +def lookupScript (c : UInt32) : Script := CLib.lookupScript c + +/-- Get the name of a script + + Unicode property: `Script` -/ +def lookupScriptName (s : Script) : Option String.Slice := + let table := table.get + if s.code < table[0]!.1 then none else + match table[find s.code (fun i => table[i]!.1) 0 table.size.toUSize]! with + | (c, v) => if s.code = c then some v else none +where + str : String := include_str "../data/table/Script_Name.txt" + table : Thunk <| Array (UInt32 × String.Slice) := parseTable str fun _ n => n[0]! + end Unicode diff --git a/UnicodeBasic/Types.lean b/UnicodeBasic/Types.lean index ca6fd1598..ee0505525 100644 --- a/UnicodeBasic/Types.lean +++ b/UnicodeBasic/Types.lean @@ -3,6 +3,8 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +import Std.Data.HashMap + /-- Low-level conversion from `UInt32` to `Char` (*unsafe*) This function translates to a no-op in the compiler. However, it does not @@ -1011,4 +1013,64 @@ def BidiClass.ofAbbrev! (abbr : String.Slice) : BidiClass := instance : Repr BidiClass where reprPrec bc _ := s!"Unicode.BidiClass.{bc.toAbbrev}" +/-! + ## Scripts ## +-/ + +/-- Check if valid script identifier -/ +@[inline] +def Script.isValid (c : UInt32) : Bool := + let c0 := (c >>> 24).toUInt8 + let c1 := (c >>> 16).toUInt8 + let c2 := (c >>> 8).toUInt8 + let c3 := c.toUInt8 + (c0 ≤ 'Z'.toUInt8 && 'A'.toUInt8 ≤ c0) + && (c1 ≤ 'z'.toUInt8 && 'a'.toUInt8 ≤ c1) + && (c2 ≤ 'z'.toUInt8 && 'a'.toUInt8 ≤ c2) + && (c3 ≤ 'z'.toUInt8 && 'a'.toUInt8 ≤ c3) + +/-- Script identifier type -/ +structure Script where + code : UInt32 + is_valid : Script.isValid code +deriving DecidableEq, Hashable + +namespace Script + +/-- Default value is `Zzzz` (`Unknown`) -/ +instance : Inhabited Script where + default := { + code := (((('Z'.val <<< 8 ||| 'z'.val) <<< 8) ||| 'z'.val) <<< 8) ||| 'z'.val + is_valid := by decide + } + +/-- String abbreviation of script -/ +@[extern "unicode_script_to_abbrev"] +def toAbbrev : Script → String + | ⟨c, _⟩ => + let c0 := Char.ofUInt8 (c >>> 24).toUInt8 + let c1 := Char.ofUInt8 (c >>> 16).toUInt8 + let c2 := Char.ofUInt8 (c >>> 8).toUInt8 + let c3 := Char.ofUInt8 c.toUInt8 + String.ofList [c0, c1, c2, c3] + +@[extern "unicode_script_of_abbrev"] +private opaque ofAbbrevAux (abbr : String) : UInt32 + +/-- Get script from abbreviation -/ +def ofAbbrev? (abbr : String.Slice) : Option Script := + if abbr.utf8ByteSize = 4 then + let code := ofAbbrevAux abbr.toString + if h : Script.isValid code then + some ⟨code, h⟩ + else + none + else + none + +@[inline, inherit_doc ofAbbrev?] +def ofAbbrev! (abbr : String.Slice) : Script := ofAbbrev? abbr |>.get! + +end Script + end Unicode diff --git a/UnicodeCLib/basic.c b/UnicodeCLib/basic.c index df7c30123..be91265f0 100644 --- a/UnicodeCLib/basic.c +++ b/UnicodeCLib/basic.c @@ -1,5 +1,9 @@ #include +#ifdef _WIN32 +#include +#else #include +#endif #include "basic.h" static inline int unicode_script_is_valid(uint32_t c) { diff --git a/UnicodeCLib/basic.h b/UnicodeCLib/basic.h index 752bcc567..9a973b54c 100644 --- a/UnicodeCLib/basic.h +++ b/UnicodeCLib/basic.h @@ -69,6 +69,12 @@ LEAN_EXPORT uint64_t unicode_prop_lookup(uint32_t c); LEAN_EXPORT uint64_t unicode_case_lookup(uint32_t c); +LEAN_EXPORT uint32_t unicode_script_lookup(uint32_t c); + +LEAN_EXPORT uint32_t unicode_script_of_abbrev(b_lean_obj_arg abbr); + +LEAN_EXPORT lean_obj_res unicode_script_to_abbrev(uint32_t scr); + #ifdef __cplusplus } #endif diff --git a/UnicodeData.lean b/UnicodeData.lean index 760376acf..5aeb9a3e2 100644 --- a/UnicodeData.lean +++ b/UnicodeData.lean @@ -6,3 +6,4 @@ Released under Apache 2.0 license as described in the file LICENSE. import UnicodeData.Aliases import UnicodeData.Basic import UnicodeData.PropList +import UnicodeData.Scripts diff --git a/lakefile.lean b/lakefile.lean index 6b41b54a4..9b2d0eb68 100644 --- a/lakefile.lean +++ b/lakefile.lean @@ -62,4 +62,5 @@ script updateUCD do downloadUCD [ "PropList.txt", "PropertyAliases.txt", "PropertyValueAliases.txt", + "Scripts.txt", ] diff --git a/makeCLib.lean b/makeCLib.lean index 0157bdcbc..79d5e611b 100644 --- a/makeCLib.lean +++ b/makeCLib.lean @@ -191,7 +191,45 @@ static const unicode_data_t table[] = {" file.putStrLn s!"\{UINT32_C({c₀}),UINT32_C({c₁}),UINT64_C({d})}," file.putStrLn "};" +def scriptTable : Array (UInt32 × UInt32 × UInt64) := + let t := Scripts.data.toArray.flatMap fun (sc, t) => + let sc := Script.ofAbbrev! <| PropertyValueAliases.getShortName! "sc" sc + t.map fun (c₀, c₁) => (c₀, c₁, sc.code.toUInt64) + t.qsort fun (a, _) (b, _) => a < b + +def makeScriptC : IO Unit := + IO.FS.withFile (clibDir / "script.c") .write fun file => do + file.putStrLn s!"/* THIS IS A GENERATED FILE - DO NOT EDIT */ +#include \"basic.h\" + +#define TABLE_SIZE {scriptTable.size} +#define SCRIPT_UNKNOWN UINT32_C({Script.ofAbbrev! "Zzzz" |>.code})" + file.putStrLn " +static const unicode_data_t table[TABLE_SIZE]; + +uint32_t unicode_script_lookup(uint32_t c) { + unicode_data_t v; + if (c >= table[0].cmin) { + unsigned int lo = 0; + unsigned int hi = TABLE_SIZE; + unsigned int i = (lo + hi)/2; + while (i != lo) { + if (c < table[i].cmin) hi = i; + else lo = i; + i = (lo + hi)/2; + } + if (c <= table[i].cmax) return (uint32_t)table[i].data; + } + return SCRIPT_UNKNOWN; +} + +static const unicode_data_t table[] = {" + for (c₀, c₁, d) in scriptTable do + file.putStrLn s!"\{UINT32_C({c₀}),UINT32_C({c₁}),UINT64_C({d})}," + file.putStrLn "};" + def main : IO UInt32 := do makeCaseC makePropC + makeScriptC return 0 diff --git a/makeTables.lean b/makeTables.lean index dd45121e1..f7c0e1a7c 100644 --- a/makeTables.lean +++ b/makeTables.lean @@ -520,6 +520,12 @@ def mkWhiteSpace : Array (UInt32 × UInt32) := | (c₀, some c₁) => (c₀, c₁) | (c₀, none) => (c₀, c₀) +def mkScriptName : Array (UInt32 × String) := + let t := PropertyAliases.getValues! "Script" |>.map fun name => + let s := Script.ofAbbrev! <| PropertyValueAliases.getShortName! "Script" name + (s.code, name.toString) + t.qsort fun (a, _) (b, _) => a < b + def main (args : List String) : IO UInt32 := do let args := if args != [] then args else [ "Bidi_Class", @@ -530,6 +536,7 @@ def main (args : List String) : IO UInt32 := do "Default_Ignorable_Code_Point", "Name", "Numeric_Value", + "Script_Name", "White_Space"] let tableDir : System.FilePath := "."/"data"/"table" IO.FS.createDirAll tableDir @@ -767,6 +774,13 @@ def main (args : List String) : IO UInt32 := do else file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁ IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}" + | "Script_Name" => + IO.println s!"Generating table {arg}" + let table := mkScriptName + IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do + for (c, name) in table do + file.putStrLn <| toHexStringAux c ++ ";" ++ name + IO.println s!"Size: {table.size}" | "Titlecase" => IO.println s!"Generating table {arg}" let table ← mkTitlecase