Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
zip -rq docs-${TAG_NAME}.zip doc doc-data

- name: Release
uses: softprops/action-gh-release@v2
uses: softprops/action-gh-release@v3
with:
files: |
docs/docs-${{ github.ref_name }}.tar.gz
Expand Down
21 changes: 21 additions & 0 deletions UnicodeBasic.lean
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,27 @@ namespace Unicode
@[inline]
def getName (char : Char) : String := lookupName char.val

/-!
## Script ##
-/

/-- Get character script

Unicode property: `Script`
-/
@[inline]
def getScript (char : Char) : Script := lookupScript char.val

/-- Get script name

Returns `none` if the script code is unassigned.

Unicode property: `Script`
-/
@[inline]
def getScriptName? (s : Script) : Option String :=
lookupScriptName s |>.map toString

/-!
## Bidirectional Properties ##
-/
Expand Down
21 changes: 21 additions & 0 deletions UnicodeBasic/TableLookup.lean
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ protected abbrev oMath : UInt64 := 0x800000000
@[extern "unicode_prop_lookup"]
protected opaque lookupProp (c : UInt32) : UInt64

@[extern "unicode_script_lookup"]
protected opaque lookupScript (c : UInt32) : Script

end CLib

/-- Binary search -/
Expand Down Expand Up @@ -386,4 +389,22 @@ where
str : String := include_str "../data/table/White_Space.txt"
table : Thunk <| Array (UInt32 × UInt32) := parsePropTable str

/-- Get the script of a code point using lookup table

Unicode property: `Script` -/
@[inline]
def lookupScript (c : UInt32) : Script := CLib.lookupScript c

/-- Get the name of a script

Unicode property: `Script` -/
def lookupScriptName (s : Script) : Option String.Slice :=
let table := table.get
if s.code < table[0]!.1 then none else
match table[find s.code (fun i => table[i]!.1) 0 table.size.toUSize]! with
| (c, v) => if s.code = c then some v else none
where
str : String := include_str "../data/table/Script_Name.txt"
table : Thunk <| Array (UInt32 × String.Slice) := parseTable str fun _ n => n[0]!

end Unicode
62 changes: 62 additions & 0 deletions UnicodeBasic/Types.lean
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
-/

import Std.Data.HashMap

/-- Low-level conversion from `UInt32` to `Char` (*unsafe*)

This function translates to a no-op in the compiler. However, it does not
Expand Down Expand Up @@ -1011,4 +1013,64 @@ def BidiClass.ofAbbrev! (abbr : String.Slice) : BidiClass :=
instance : Repr BidiClass where
reprPrec bc _ := s!"Unicode.BidiClass.{bc.toAbbrev}"

/-!
## Scripts ##
-/

/-- Check if valid script identifier -/
@[inline]
def Script.isValid (c : UInt32) : Bool :=
let c0 := (c >>> 24).toUInt8
let c1 := (c >>> 16).toUInt8
let c2 := (c >>> 8).toUInt8
let c3 := c.toUInt8
(c0 ≤ 'Z'.toUInt8 && 'A'.toUInt8 ≤ c0)
&& (c1 ≤ 'z'.toUInt8 && 'a'.toUInt8 ≤ c1)
&& (c2 ≤ 'z'.toUInt8 && 'a'.toUInt8 ≤ c2)
&& (c3 ≤ 'z'.toUInt8 && 'a'.toUInt8 ≤ c3)

/-- Script identifier type -/
structure Script where
code : UInt32
is_valid : Script.isValid code
deriving DecidableEq, Hashable

namespace Script

/-- Default value is `Zzzz` (`Unknown`) -/
instance : Inhabited Script where
default := {
code := (((('Z'.val <<< 8 ||| 'z'.val) <<< 8) ||| 'z'.val) <<< 8) ||| 'z'.val
is_valid := by decide
}

/-- String abbreviation of script -/
@[extern "unicode_script_to_abbrev"]
def toAbbrev : Script → String
| ⟨c, _⟩ =>
let c0 := Char.ofUInt8 (c >>> 24).toUInt8
let c1 := Char.ofUInt8 (c >>> 16).toUInt8
let c2 := Char.ofUInt8 (c >>> 8).toUInt8
let c3 := Char.ofUInt8 c.toUInt8
String.ofList [c0, c1, c2, c3]

@[extern "unicode_script_of_abbrev"]
private opaque ofAbbrevAux (abbr : String) : UInt32

/-- Get script from abbreviation -/
def ofAbbrev? (abbr : String.Slice) : Option Script :=
if abbr.utf8ByteSize = 4 then
let code := ofAbbrevAux abbr.toString
if h : Script.isValid code then
some ⟨code, h⟩
else
none
else
none

@[inline, inherit_doc ofAbbrev?]
def ofAbbrev! (abbr : String.Slice) : Script := ofAbbrev? abbr |>.get!

end Script

end Unicode
4 changes: 4 additions & 0 deletions UnicodeCLib/basic.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#include <lean/lean.h>
#ifdef _WIN32
#include <winsock2.h>
#else
#include <arpa/inet.h>
#endif
#include "basic.h"

static inline int unicode_script_is_valid(uint32_t c) {
Expand Down
6 changes: 6 additions & 0 deletions UnicodeCLib/basic.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ LEAN_EXPORT uint64_t unicode_prop_lookup(uint32_t c);

LEAN_EXPORT uint64_t unicode_case_lookup(uint32_t c);

LEAN_EXPORT uint32_t unicode_script_lookup(uint32_t c);

LEAN_EXPORT uint32_t unicode_script_of_abbrev(b_lean_obj_arg abbr);

LEAN_EXPORT lean_obj_res unicode_script_to_abbrev(uint32_t scr);

#ifdef __cplusplus
}
#endif
1 change: 1 addition & 0 deletions UnicodeData.lean
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ Released under Apache 2.0 license as described in the file LICENSE.
import UnicodeData.Aliases
import UnicodeData.Basic
import UnicodeData.PropList
import UnicodeData.Scripts
1 change: 1 addition & 0 deletions lakefile.lean
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,5 @@ script updateUCD do downloadUCD [
"PropList.txt",
"PropertyAliases.txt",
"PropertyValueAliases.txt",
"Scripts.txt",
]
38 changes: 38 additions & 0 deletions makeCLib.lean
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,45 @@ static const unicode_data_t table[] = {"
file.putStrLn s!"\{UINT32_C({c₀}),UINT32_C({c₁}),UINT64_C({d})},"
file.putStrLn "};"

def scriptTable : Array (UInt32 × UInt32 × UInt64) :=
let t := Scripts.data.toArray.flatMap fun (sc, t) =>
let sc := Script.ofAbbrev! <| PropertyValueAliases.getShortName! "sc" sc
t.map fun (c₀, c₁) => (c₀, c₁, sc.code.toUInt64)
t.qsort fun (a, _) (b, _) => a < b

def makeScriptC : IO Unit :=
IO.FS.withFile (clibDir / "script.c") .write fun file => do
file.putStrLn s!"/* THIS IS A GENERATED FILE - DO NOT EDIT */
#include \"basic.h\"

#define TABLE_SIZE {scriptTable.size}
#define SCRIPT_UNKNOWN UINT32_C({Script.ofAbbrev! "Zzzz" |>.code})"
file.putStrLn "
static const unicode_data_t table[TABLE_SIZE];

uint32_t unicode_script_lookup(uint32_t c) {
unicode_data_t v;
if (c >= table[0].cmin) {
unsigned int lo = 0;
unsigned int hi = TABLE_SIZE;
unsigned int i = (lo + hi)/2;
while (i != lo) {
if (c < table[i].cmin) hi = i;
else lo = i;
i = (lo + hi)/2;
}
if (c <= table[i].cmax) return (uint32_t)table[i].data;
}
return SCRIPT_UNKNOWN;
}

static const unicode_data_t table[] = {"
for (c₀, c₁, d) in scriptTable do
file.putStrLn s!"\{UINT32_C({c₀}),UINT32_C({c₁}),UINT64_C({d})},"
file.putStrLn "};"

def main : IO UInt32 := do
makeCaseC
makePropC
makeScriptC
return 0
14 changes: 14 additions & 0 deletions makeTables.lean
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,12 @@ def mkWhiteSpace : Array (UInt32 × UInt32) :=
| (c₀, some c₁) => (c₀, c₁)
| (c₀, none) => (c₀, c₀)

def mkScriptName : Array (UInt32 × String) :=
let t := PropertyAliases.getValues! "Script" |>.map fun name =>
let s := Script.ofAbbrev! <| PropertyValueAliases.getShortName! "Script" name
(s.code, name.toString)
t.qsort fun (a, _) (b, _) => a < b

def main (args : List String) : IO UInt32 := do
let args := if args != [] then args else [
"Bidi_Class",
Expand All @@ -530,6 +536,7 @@ def main (args : List String) : IO UInt32 := do
"Default_Ignorable_Code_Point",
"Name",
"Numeric_Value",
"Script_Name",
"White_Space"]
let tableDir : System.FilePath := "."/"data"/"table"
IO.FS.createDirAll tableDir
Expand Down Expand Up @@ -767,6 +774,13 @@ def main (args : List String) : IO UInt32 := do
else
file.putStrLn <| toHexStringAux c₀ ++ ";" ++ toHexStringAux c₁
IO.println s!"Size: {(statsProp table).1} + {(statsProp table).2}"
| "Script_Name" =>
IO.println s!"Generating table {arg}"
let table := mkScriptName
IO.FS.withFile (tableDir/(arg ++ ".txt")) .write fun file => do
for (c, name) in table do
file.putStrLn <| toHexStringAux c ++ ";" ++ name
IO.println s!"Size: {table.size}"
| "Titlecase" =>
IO.println s!"Generating table {arg}"
let table ← mkTitlecase
Expand Down
Loading