From 2681990652297d782cca9bf8bb5af6ba5e9f7634 Mon Sep 17 00:00:00 2001 From: Matjaz Domen Pecan Date: Sat, 18 Apr 2026 11:54:15 +0200 Subject: [PATCH] fix(lexer): treat backtick body as opaque when inner parse fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bash treats the body of `` `...` `` as a single word token at the initial lexing stage — errors inside the backtick are runtime, not parse, concerns. Rable's fork-and-merge parser rejects bodies with reserved words in non-reserved positions, unbalanced conditionals, or unterminated ANSI-C quotes, producing error output or silently corrupting the outer AST where bash yields a plain word. Add `scan_backtick_opaque` as a fallback in `read_backtick_inner`: on `Err` from `parse_backtick_body`, re-scan the body as raw bytes, treating `\` as a two-byte escape so `` \` `` does not terminate. The `Ok` path is unchanged, so valid backticks still produce the same tokens and spans as today. Unlocks all 6 `backtick_opaque N` cases in `bash_valid_divergences`. Closes #38 --- src/lexer/backtick_opaque_tests.rs | 95 ++++++++++++++++++++++++++++++ src/lexer/expansions.rs | 39 +++++++++++- src/lexer/mod.rs | 2 + tests/integration.rs | 9 +-- 4 files changed, 137 insertions(+), 8 deletions(-) create mode 100644 src/lexer/backtick_opaque_tests.rs diff --git a/src/lexer/backtick_opaque_tests.rs b/src/lexer/backtick_opaque_tests.rs new file mode 100644 index 0000000..52b4f9d --- /dev/null +++ b/src/lexer/backtick_opaque_tests.rs @@ -0,0 +1,95 @@ +//! Tests for the opaque-backtick fallback (issue #38). +//! +//! When `parse_backtick_body` rejects a backtick body, `read_backtick_inner` +//! falls back to a raw byte-level scan for the closing `` ` ``, matching +//! bash's lexing rule that a backtick body is a single word token whose +//! errors (if any) are runtime concerns, not parse-time.
+ +use super::Lexer; +use crate::error::RableError; +use crate::token::TokenType; + +#[allow(clippy::unwrap_used)] +fn collect_tokens(source: &str) -> Vec<(TokenType, String)> { + let mut lexer = Lexer::new(source, false); + let mut tokens = Vec::new(); + loop { + let tok = lexer.next_token().unwrap(); + if tok.kind == TokenType::Eof { + break; + } + tokens.push((tok.kind, tok.value)); + } + tokens +} + +#[test] +fn invalid_body_becomes_opaque_word() { + // `else echo` — fork fails because `else` at command start is a + // reserved word and cannot begin a simple command. The fallback + // scanner must emit the whole backtick as one Word token. + let tokens = collect_tokens("`else echo`"); + assert_eq!(tokens.len(), 1); + assert_eq!(tokens[0].0, TokenType::Word); + assert_eq!(tokens[0].1, "`else echo`"); +} + +#[test] +fn escape_does_not_terminate() { + // Inside an opaque backtick body, `\` consumes two bytes, + // so an escaped `` ` `` does not falsely terminate the body. + let tokens = collect_tokens("`else \\`then\\` echo`"); + assert_eq!(tokens.len(), 1); + assert_eq!(tokens[0].0, TokenType::Word); + assert_eq!(tokens[0].1, "`else \\`then\\` echo`"); +} + +#[test] +fn literal_newline_escape_consumes_two_bytes() { + // `\n` inside an opaque body is literal backslash-then-n, not a + // newline. The scanner's two-byte escape rule must consume both + // without touching the line counter. + let tokens = collect_tokens("`else a\\nb`"); + assert_eq!(tokens.len(), 1); + assert_eq!(tokens[0].0, TokenType::Word); + assert_eq!(tokens[0].1, "`else a\\nb`"); +} + +#[test] +fn trailing_backslash_at_eof_surfaces_error() { + // A lone trailing `\` with no following byte must still produce + // a MatchedPair error — the inner `if let` is a no-op, the outer + // loop sees EOF, and the scanner reports unterminated backtick. + let mut lexer = Lexer::new("`else\\", false); + assert!(matches!( + lexer.next_token(), + Err(RableError::MatchedPair { .. 
}), + )); +} + +#[test] +fn unterminated_body_surfaces_error() { + // Invalid body with no closing backtick must surface a + // MatchedPair error rather than silently consuming input. + let mut lexer = Lexer::new("`else echo", false); + assert!(matches!( + lexer.next_token(), + Err(RableError::MatchedPair { .. }), + )); +} + +#[test] +#[allow(clippy::unwrap_used)] +fn newlines_in_body_advance_line_counter() { + // Newlines inside an opaque backtick body must advance the + // line counter so subsequent tokens report the correct line. + let mut lexer = Lexer::new("`else\necho\n`\nok", false); + let bt = lexer.next_token().unwrap(); + assert_eq!(bt.kind, TokenType::Word); + assert_eq!(bt.value, "`else\necho\n`"); + let nl = lexer.next_token().unwrap(); + assert_eq!(nl.kind, TokenType::Newline); + let ok = lexer.next_token().unwrap(); + assert_eq!(ok.value, "ok"); + assert_eq!(ok.line, 4); +} diff --git a/src/lexer/expansions.rs b/src/lexer/expansions.rs index 5bb4da3..b0133a0 100644 --- a/src/lexer/expansions.rs +++ b/src/lexer/expansions.rs @@ -419,7 +419,10 @@ impl Lexer { fn read_backtick_inner(&mut self, wb: &mut WordBuilder) -> Result<()> { let body_start = self.pos; let outer_depth = self.parser_depth(); - let (end_pos, end_line) = crate::parser::parse_backtick_body(self, outer_depth)?; + let (end_pos, end_line) = match crate::parser::parse_backtick_body(self, outer_depth) { + Ok(r) => r, + Err(_) => self.scan_backtick_opaque(body_start)?, + }; wb.value .extend(self.input[body_start..end_pos].iter().copied()); self.pos = end_pos; @@ -427,6 +430,40 @@ impl Lexer { Ok(()) } + /// Raw scan for the closing backtick, used as a fallback when + /// `parse_backtick_body` rejects the body. Bash treats a backtick + /// body as a single word token at the initial lexing stage — errors + /// inside are runtime, not parse, concerns. Issue #38. + /// + /// Only recognizes `\` as a two-byte escape (so an escaped + /// `` ` `` does not terminate). 
Returns `(end_pos, end_line)` with + /// `end_pos` one past the closing backtick; errors with + /// `MatchedPair` when EOF is reached first. + fn scan_backtick_opaque(&self, body_start: usize) -> Result<(usize, usize)> { + let mut pos = body_start; + let mut line = self.line; + while let Some(c) = self.input.get(pos).copied() { + match c { + '\\' => { + pos += 1; + if let Some(next) = self.input.get(pos).copied() { + if next == '\n' { + line += 1; + } + pos += 1; + } + } + '`' => return Ok((pos + 1, line)), + '\n' => { + line += 1; + pos += 1; + } + _ => pos += 1, + } + } + Err(RableError::matched_pair("unterminated backtick", pos, line)) + } + /// Reads deprecated `$[...]` arithmetic with bracket depth tracking. fn read_deprecated_arith(&mut self, wb: &mut WordBuilder) -> Result<()> { let mut depth = 1; diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 805f0d2..eae7e10 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -13,6 +13,8 @@ mod quotes; pub(super) mod word_builder; mod words; +#[cfg(test)] +mod backtick_opaque_tests; #[cfg(test)] mod tests; diff --git a/tests/integration.rs b/tests/integration.rs index 60c4de2..f8e7f99 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -218,13 +218,8 @@ const KNOWN_ORACLE_FAILURES: &[&str] = &[ // #37 — reserved words as plain words: cases 1, 3, 4 fixed by #44; // case 5 fixed as a side effect of #35; case 2 fixed by #42 // (`((` → nested subshell fallback). - // #38 — backticks opaque on invalid content - "backtick_opaque 1", - "backtick_opaque 2", - "backtick_opaque 3", - "backtick_opaque 4", - "backtick_opaque 5", - "backtick_opaque 6", + // #38 — backticks opaque on invalid content: all 6 cases fixed via + // `scan_backtick_opaque` fallback in `read_backtick_inner`. // #39 — heredoc inside $(...); both cases fixed: case 1 fell out // of #40's `<<'EOF'` fix, case 2 via the sloppy-delimiter path // in `read_heredoc_body` + `reformat_bash` using Cmdsub mode.