diff --git a/crates/README.md b/crates/README.md new file mode 100644 index 00000000000..75afd1cf111 --- /dev/null +++ b/crates/README.md @@ -0,0 +1,10 @@ +# crates + +This directory contains the crates that make up Edit and its supporting tooling. + +* `edit`: Main editor binary and library
+ It is split apart into a library to allow for benchmarks. +* `lsh`: Syntax-highlighting compiler and runtime +* `lsh-bin`: A small CLI for experimenting with and debugging LSH output +* `stdext`: Shared utility code used across the workspace +* `unicode-gen`: Code generation utilities for Unicode LUTs diff --git a/crates/lsh-bin/src/main.rs b/crates/lsh-bin/src/main.rs index 365212bbbb8..68547c431d5 100644 --- a/crates/lsh-bin/src/main.rs +++ b/crates/lsh-bin/src/main.rs @@ -31,24 +31,24 @@ enum SubCommands { #[derive(FromArgs, PartialEq, Debug)] #[argh(subcommand, name = "compile", description = "Generate Rust code from .lsh files")] struct SubCommandOneCompile { - #[argh(positional, description = "source .lsh file or directory")] - lsh: PathBuf, + #[argh(positional, description = "source .lsh files or directories")] + lsh: Vec<PathBuf>, } #[derive(FromArgs, PartialEq, Debug)] #[argh(subcommand, name = "assembly", description = "Generate assembly from .lsh files")] struct SubCommandAssembly { - #[argh(positional, description = "source .lsh file or directory")] - lsh: PathBuf, + #[argh(positional, description = "source .lsh files or directories")] + lsh: Vec<PathBuf>, } #[derive(FromArgs, PartialEq, Debug)] #[argh(subcommand, name = "render", description = "Highlight text files")] struct SubCommandRender { - #[argh(positional, description = "source .lsh file or directory")] - lsh: PathBuf, - #[argh(positional, description = "source text file")] + #[argh(option, description = "source text file")] input: PathBuf, + #[argh(positional, description = "source .lsh files or directories")] + lsh: Vec<PathBuf>, } pub fn main() { @@ -67,21 +67,32 @@ fn run() -> anyhow::Result<()> { let mut read_lsh = |path: &Path| { if path.is_dir() { generator.read_directory(path) } else { generator.read_file(path) } }; + let mut read_lsh_inputs = |paths: &[PathBuf]| -> anyhow::Result<()> { + if paths.is_empty() { + bail!("At least one .lsh file or directory is required"); + } + + for path in paths { + 
read_lsh(path)?; + } + + Ok(()) + }; match &command.sub { SubCommands::Compile(cmd) => { - read_lsh(&cmd.lsh)?; + read_lsh_inputs(&cmd.lsh)?; let output = generator.generate_rust()?; _ = stdout().write_all(output.as_bytes()); } SubCommands::Assembly(cmd) => { - read_lsh(&cmd.lsh)?; + read_lsh_inputs(&cmd.lsh)?; let vt = stdout().is_terminal(); let output = generator.generate_assembly(vt)?; _ = stdout().write_all(output.as_bytes()); } SubCommands::Render(cmd) => { - read_lsh(&cmd.lsh)?; + read_lsh_inputs(&cmd.lsh)?; run_render(generator, &cmd.input)?; } } diff --git a/crates/lsh/README.md b/crates/lsh/README.md new file mode 100644 index 00000000000..bf51865266e --- /dev/null +++ b/crates/lsh/README.md @@ -0,0 +1,26 @@ +# lsh + +`lsh` contains the compiler and runtime for Edit's syntax-highlighting system. + +At a high level: +* Language definitions live in `definitions/*.lsh` +* The compiler lowers them into bytecode +* The runtime executes the bytecode on the input text line by line + +To understand the definition language itself, read [definitions/README.md](definitions/README.md). + +For debugging and optimizing language definitions use `lsh-bin`. +To see the generated assembly, for example: +```sh +# Show the generated assembly of a file or directory +cargo run -p lsh-bin -- assembly crates/lsh/definitions/diff.lsh + +# Due to the lack of include statements, you must specify included files manually. +# Here, git_commit.lsh implicitly relies on diff() from diff.lsh. 
+cargo run -p lsh-bin -- assembly crates/lsh/definitions/git_commit.lsh crates/lsh/definitions/diff.lsh +``` + +Or to render a file: +```sh +cargo run -p lsh-bin -- render --input assets/highlighting-tests/html.html crates/lsh/definitions +``` diff --git a/crates/lsh/definitions/README.md b/crates/lsh/definitions/README.md new file mode 100644 index 00000000000..b0a29f609f3 --- /dev/null +++ b/crates/lsh/definitions/README.md @@ -0,0 +1,153 @@ +# LSH Definitions + +This directory contains syntax highlighting definitions. +Each `.lsh` file describes how to highlight one or more file types. +The compiler turns these definitions into bytecode, and the runtime executes that bytecode against the input one line at a time. + +Essentially, LSH is a small, line-oriented coroutine language for writing lexers. + +## The basic idea + +Most definitions follow the same pattern: +* Select a definition by file name or path +* Walk the current line from left to right +* Try regexes at the current position +* `yield` highlight kinds as tokens are recognized +* Use `await input` only when a construct needs to continue onto the next line + +## A minimal definition + +A definition is a `pub fn` with attributes that tell the editor when to use it: + +```rs +#[display_name = "Diff"] +#[path = "**/*.diff"] +#[path = "**/*.patch"] +pub fn diff() { + if /(?:diff|---|\+\+\+).*/ { + yield meta.header; + } else if /-.*/ { + yield markup.deleted; + } else if /\+.*/ { + yield markup.inserted; + } +} +``` + +`#[display_name]` sets the human-readable name. +`#[path]` is a glob pattern; you can have as many as you need. +Functions without `pub` are private helpers that can be called from other definitions. + +## How execution works + +The runtime feeds input to a definition one line at a time. +Within a line, matching is always left to right. 
+ +Each `if /regex/` tries to match at the current position: +* On success, the input position advances past the match and the block runs +* On failure, the input position does not move and the `else` branch, if any, runs + +Definitions behave like coroutines: +* If execution reaches `await input`, the function suspends and resumes on the next line +* If the function returns, the next line starts again from the top of the function + +## Highlighting with `yield` + +`yield <kind>` emits a highlight span. +Everything between the previous `yield` and the current position is colored with `<kind>`. + +> [!NOTE] +> This can be confusing in practice, because `yield` does not just color the regex it appears in. +> Long term, the goal is for `yield` to apply only to the regex it appears in, or to some other explicitly specified range. + +Highlight kinds are dotted identifiers such as `comment`, `string`, `keyword.control`, `constant.numeric`, and `markup.bold`. +Kinds are interned at compile time. You can invent new ones, but the editor still needs to know what color to assign them. + +`yield other` switches back to the default, unhighlighted kind. +Use it when you want to reset the current highlight between tokens. See [json.lsh](json.lsh) for a representative pattern. + +## Multi-line constructs + +Single-line constructs need no special handling. +For constructs that can span lines, such as block comments or fenced code blocks, combine `loop` or `until` with `await input`: + +```rs +if /\/\*/ { + loop { + yield comment; + await input; + if /\*\// { + yield comment; + break; + } + } +} +``` + +`await input` means "advance to the next line if there is no more input to consume here." +If there is still unconsumed text on the current line, it is a no-op and execution continues immediately. + +One important detail: if you want the remainder of the current line to stay highlighted, emit the appropriate `yield` before `await input`. 
+ +## Control flow + +| Expression | Meaning | +|------------|---------| +| `if /pat/ { ... }` | Match `pat` at the current position and enter the block on success | +| `else if /pat/ { ... }` | Try another pattern if the previous one failed | +| `else { ... }` | Fallback branch | +| `loop { ... }` | Repeat the body until `break` or `return`; `continue` jumps back to its start | +| `until /pat/ { ... }` | Repeat the body until `pat` matches, then consume the match and exit | +| `break` | Exit the innermost loop | +| `continue` | Restart the innermost loop | +| `return` | Exit the current function | + +`until /$/ { ... }` is the usual way to say "keep processing until end-of-line." + +## Capture groups + +Regexes can have capture groups. +Use `yield $N as <kind>` when only part of the match should receive a specific highlight: + +```rs +if /([\w:.-]+)\s*=/ { + yield $1 as variable; + yield other; +} +``` + +The full regex match is still consumed. +Only capture group `$1` receives the `variable` highlight; everything else falls through to the following `yield`. + +## Variables and the input position + +You can store the current input offset in a variable and compare against it later: + +```rs +var indentation = off; +// ...later... +if off <= indentation { + break; +} +``` + +`off` is the built-in register for the current position in the line. +[yaml.lsh](yaml.lsh) uses this pattern to detect when a multi-line string ends. + +## Calling other definitions + +Definitions can call helper functions or other definitions. +This is how [markdown.lsh](markdown.lsh) delegates the contents of fenced code blocks: + +```rs +if /(?i:json)/ { + loop { + await input; + if /\s*```/ { return; } + else { json(); if /.*/ {} } + } +} +``` + +The `if /.*/ {}` at the end consumes any text that the nested definition did not consume itself. +Without that final match, `await input` would see remaining input on the current line and continue immediately instead of advancing to the next line. 
diff --git a/crates/lsh/src/lib.rs b/crates/lsh/src/lib.rs index 8f914e202ca..328fbcd1e3e 100644 --- a/crates/lsh/src/lib.rs +++ b/crates/lsh/src/lib.rs @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -//! Welcome to Leonard's Syntax Highlighter (LSH), otherwise known as +//! Welcome to the Lightweight Syntax Highlighter (LSH), otherwise known as //! Leonard's Shitty Highlighter, which is really what it is. //! //! ## Architecture