From c2c486a3c021222ce4f8bae37a6c6e2170d422ed Mon Sep 17 00:00:00 2001 From: Brian Oiko Date: Tue, 21 Apr 2026 12:20:19 +0000 Subject: [PATCH] tests: add whitespace tests for vertical tab behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * tests: add whitespace tests for vertical tab behavior Add two small tests to highlight how vertical tab is handled differently. - vertical_tab_lexer.rs checks that the lexer treats vertical tab as whitespace - ascii_whitespace_excludes_vertical_tab.rs shows that split_ascii_whitespace does not split on it This helps document the difference between the Rust parser (which accepts vertical tab) and the standard library’s ASCII whitespace handling. See: rust-lang/rust-project-goals#53 * tests: add ignore-tidy-tab directive to whitespace tests * tests: expand vertical tab lexer test to cover all Pattern_White_Space chars * tests: add whitespace/ README entry explaining lexer vs stdlib mismatch * Update ascii_whitespace_excludes_vertical_tab.rs * Update ascii_whitespace_excludes_vertical_tab.rs make sure tabs and spaces are well checked * Update ascii_whitespace_excludes_vertical_tab.rs * fix tidy: add whitespace README entry * Update README.md with missing full stop * Update ascii_whitespace_excludes_vertical_tab.rs * fix tidy: use full path format for whitespace README entry * fix tidy: README order, trailing newlines in whitespace tests * fix: add run-pass directive and restore embedded whitespace bytes * fix tidy: remove duplicate whitespace README entry * Add failing UI test for invalid whitespace (zero width space) This adds a //@ check-fail test to ensure that disallowed whitespace characters like ZERO WIDTH SPACE are rejected by the Rust lexer. 
* git add tests/ui/whitespace/invalid_whitespace.rs git commit -m "Fix tidy: add trailing newline" git push * Fix tidy: add trailing newline * Update invalid_whitespace.rs * Update invalid_whitespace.rs * Clean up whitespace in invalid_whitespace.rs Remove unnecessary blank lines in invalid_whitespace.rs * Update invalid_whitespace.rs * Clarify ZERO WIDTH SPACE usage in test Update comment to clarify usage of ZERO WIDTH SPACE. * Improve error messages for invalid whitespace Updated error messages to clarify the issue with invisible characters. * Modify invalid_whitespace test for clarity Update test to check for invalid whitespace characters. * Resolve unknown token error in invalid_whitespace.rs Fix whitespace issue causing unknown token error. * Remove invisible character from variable assignment Fix invisible character issue in variable assignment. * Improve error message for invalid whitespace Updated error message to clarify invisible characters. * Improve error handling for invisible characters Updated error message for invisible characters in code. * Document error for unknown token due to whitespace Add error message for invalid whitespace in code * Update error message for invalid whitespace handling * Modify invalid_whitespace.rs for whitespace checks Updated the test to check for invalid whitespace handling. * Correct whitespace in variable declaration Fix formatting issue by adding space around '=' in variable declaration. * Update error message for invalid whitespace * Update invalid_whitespace.stderr * Refine error handling for invalid whitespace test Update the error messages for invalid whitespace in the test. * Update invalid_whitespace.rs * Fix whitespace issues in invalid_whitespace.rs * Update invalid_whitespace.stderr file * Clean up whitespace in invalid_whitespace.rs Removed unnecessary blank lines from the test file. 
* Update invalid_whitespace.stderr --- tests/ui/README.md | 15 +++++ .../ascii_whitespace_excludes_vertical_tab.rs | 22 +++++++ tests/ui/whitespace/invalid_whitespace.rs | 13 +++++ tests/ui/whitespace/invalid_whitespace.stderr | 10 ++++ tests/ui/whitespace/vertical_tab_lexer.rs | 58 +++++++++++++++++++ 5 files changed, 118 insertions(+) create mode 100644 tests/ui/whitespace/ascii_whitespace_excludes_vertical_tab.rs create mode 100644 tests/ui/whitespace/invalid_whitespace.rs create mode 100644 tests/ui/whitespace/invalid_whitespace.stderr create mode 100644 tests/ui/whitespace/vertical_tab_lexer.rs diff --git a/tests/ui/README.md b/tests/ui/README.md index 7c2df5048fc1b..d803af7948bde 100644 --- a/tests/ui/README.md +++ b/tests/ui/README.md @@ -1582,6 +1582,21 @@ Tests on various well-formedness checks, e.g. [Type-checking normal functions](h Tests on `where` clauses. See [Where clauses | Reference](https://doc.rust-lang.org/reference/items/generics.html#where-clauses). +## `tests/ui/whitespace/` + +Tests for whitespace handling in the Rust lexer. The Rust language +defines whitespace as Unicode Pattern_White_Space, which is not the +same as what the standard library gives you: + +- `is_ascii_whitespace` follows the WhatWG Infra Standard and skips + vertical tab (`\x0B`) +- `is_whitespace` matches Unicode White_Space, a different set: it adds e.g. `\u{A0}` but omits `\u{200E}` and `\u{200F}` + +These tests make that gap visible and check that the lexer accepts +all 11 Pattern_White_Space characters correctly. + +See: https://github.com/rustfoundation/interop-initiative/issues/53 + ## `tests/ui/windows-subsystem/`: `#![windows_subsystem = ""]` See [the `windows_subsystem` attribute](https://doc.rust-lang.org/reference/runtime.html#the-windows_subsystem-attribute). 
diff --git a/tests/ui/whitespace/ascii_whitespace_excludes_vertical_tab.rs b/tests/ui/whitespace/ascii_whitespace_excludes_vertical_tab.rs new file mode 100644 index 0000000000000..aa4c09a9ba48b --- /dev/null +++ b/tests/ui/whitespace/ascii_whitespace_excludes_vertical_tab.rs @@ -0,0 +1,22 @@ +//@ run-pass +// This test checks that split_ascii_whitespace does NOT split on +// vertical tab (\x0B), because the standard library uses the WhatWG +// Infra Standard definition of ASCII whitespace, which excludes +// vertical tab. +// +// See: https://github.com/rust-lang/rust-project-goals/issues/53 + +fn main() { + let s = "a\x0Bb"; + + let parts: Vec<&str> = s.split_ascii_whitespace().collect(); + + assert_eq!(parts.len(), 1, + "vertical tab should not be treated as ASCII whitespace"); + + let s2 = "a b"; + let parts2: Vec<&str> = s2.split_ascii_whitespace().collect(); + assert_eq!(parts2.len(), 2, + "regular space should split correctly"); + +} diff --git a/tests/ui/whitespace/invalid_whitespace.rs b/tests/ui/whitespace/invalid_whitespace.rs new file mode 100644 index 0000000000000..809c8e2af0a25 --- /dev/null +++ b/tests/ui/whitespace/invalid_whitespace.rs @@ -0,0 +1,13 @@ +// This test ensures that the Rust lexer rejects invalid whitespace +// characters such as ZERO WIDTH SPACE. 
+ +//@ check-fail + +fn main() { + let x = 5; + let y = 10; + + let a=​x + y; + //~^ ERROR unknown start of token + //~| HELP invisible characters like +} diff --git a/tests/ui/whitespace/invalid_whitespace.stderr b/tests/ui/whitespace/invalid_whitespace.stderr new file mode 100644 index 0000000000000..ebd203aa41037 --- /dev/null +++ b/tests/ui/whitespace/invalid_whitespace.stderr @@ -0,0 +1,10 @@ +error: unknown start of token: \u{200b} + --> $DIR/invalid_whitespace.rs:10:11 + | +LL | let a=​x + y; + | ^ + | + = help: invisible characters like '\u{200b}' are not usually visible in text editors + +error: aborting due to 1 previous error + diff --git a/tests/ui/whitespace/vertical_tab_lexer.rs b/tests/ui/whitespace/vertical_tab_lexer.rs new file mode 100644 index 0000000000000..75f4543a1fe2d --- /dev/null +++ b/tests/ui/whitespace/vertical_tab_lexer.rs @@ -0,0 +1,58 @@ +//@ run-pass +// ignore-tidy-tab +// +// Tests that the Rust lexer accepts Unicode Pattern_White_Space characters. +// +// Worth noting: the Rust reference defines whitespace as Pattern_White_Space, +// which is not the same as what is_ascii_whitespace or is_whitespace give you. +// +// is_ascii_whitespace follows WhatWG and skips vertical tab (\x0B). +// is_whitespace uses Unicode White_Space, which is a broader set. 
+// +// The 11 characters that actually count as whitespace in Rust source: +// \x09 \x0A \x0B \x0C \x0D \x20 \u{85} \u{200E} \u{200F} \u{2028} \u{2029} +// +// Ref: https://github.com/rustfoundation/interop-initiative/issues/53 + +#[rustfmt::skip] +fn main() { + // tab (\x09) between let and the name + let _ws1 = 1_i32; + + // vertical tab (\x0B) between let and the name + // this is the one is_ascii_whitespace does not treat as whitespace + let _ws2 = 2_i32; + + // form feed (\x0C) between let and the name + let _ws3 = 3_i32; + + // plain space (\x20), here just so every character is represented + let _ws4 = 4_i32; + + // NEL (\u{85}) between let and the name + let…_ws5 = 5_i32; + + // left-to-right mark (\u{200E}) between let and the name + let‎_ws6 = 6_i32; + + // right-to-left mark (\u{200F}) between let and the name + let‏_ws7 = 7_i32; + + // \x0A, \x0D, \u{2028}, \u{2029} are also Pattern_White_Space. The lexer accepts + // them wherever whitespace is allowed, even mid-statement; they are not embedded + // here only because, as line terminators, they would split the source line. + + // These are Unicode White_Space but NOT Pattern_White_Space: + // \u{A0} no-break space \u{1680} ogham space mark + // \u{2000} en quad \u{2001} em quad + // \u{2002} en space \u{2003} em space + // \u{2004} three-per-em space \u{2005} four-per-em space + // \u{2006} six-per-em space \u{2007} figure space + // \u{2008} punctuation space \u{2009} thin space + // \u{200A} hair space \u{202F} narrow no-break space + // \u{205F} medium math space \u{3000} ideographic space + + // add them up and print the total so every binding is actually used + let _sum = _ws1 + _ws2 + _ws3 + _ws4 + _ws5 + _ws6 + _ws7; + println!("{}", _sum); +}