From 2681990652297d782cca9bf8bb5af6ba5e9f7634 Mon Sep 17 00:00:00 2001
From: Matjaz Domen Pecan <matjaz.pecan@gmail.com>
Date: Sat, 18 Apr 2026 11:54:15 +0200
Subject: [PATCH] fix(lexer): treat backtick body as opaque when inner parse
 fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bash treats the body of `` `...` `` as a single word token at the
initial lexing stage — errors inside the backtick are runtime, not
parse, concerns. Rable's fork-and-merge parser rejects bodies with
reserved words in non-reserved positions, unbalanced conditionals,
or unterminated ANSI-C quotes, producing `<error>` output or silently
corrupting the outer AST where bash yields a plain word.

Add `scan_backtick_opaque` as a fallback in `read_backtick_inner`:
on `Err` from `parse_backtick_body`, re-scan the body as raw
characters, treating `\<x>` as a two-character escape so `` \` ``
does not terminate the body. The `Ok` path is unchanged, so valid
backticks still produce the same tokens and spans as today.

Unlocks all 6 `backtick_opaque N` cases in `bash_valid_divergences`.

Closes #38
---
 src/lexer/backtick_opaque_tests.rs | 95 ++++++++++++++++++++++++++++++
 src/lexer/expansions.rs            | 39 +++++++++++-
 src/lexer/mod.rs                   |  2 +
 tests/integration.rs               |  9 +--
 4 files changed, 137 insertions(+), 8 deletions(-)
 create mode 100644 src/lexer/backtick_opaque_tests.rs
diff --git a/src/lexer/backtick_opaque_tests.rs b/src/lexer/backtick_opaque_tests.rs
new file mode 100644
index 0000000..52b4f9d
--- /dev/null
+++ b/src/lexer/backtick_opaque_tests.rs
@@ -0,0 +1,95 @@
+//! Tests for the opaque-backtick fallback (issue #38).
+//!
+//! When `parse_backtick_body` rejects a backtick body, `read_backtick_inner`
+//! falls back to a raw character-level scan for the closing `` ` ``, matching
+//! bash's lexing rule that a backtick body is a single word token whose
+//! errors (if any) are runtime concerns, not parse-time ones.
+
+use super::Lexer;
+use crate::error::RableError;
+use crate::token::TokenType;
+
+#[allow(clippy::unwrap_used)] // test helper: any lexer error should panic and fail the test
+fn collect_tokens(source: &str) -> Vec<(TokenType, String)> {
+    let mut lexer = Lexer::new(source, false);
+    let mut tokens = Vec::new();
+    loop {
+        let tok = lexer.next_token().unwrap();
+        if tok.kind == TokenType::Eof {
+            break; // Eof ends the stream and is not part of the result
+        }
+        tokens.push((tok.kind, tok.value)); // keep only (kind, text) per token
+    }
+    tokens
+}
+
+#[test]
+fn invalid_body_becomes_opaque_word() {
+    // `else echo` — fork fails because `else` at command start is a
+    // reserved word and cannot begin a simple command. The fallback
+    // scanner must emit the whole backtick as one Word token.
+    let tokens = collect_tokens("`else echo`");
+    assert_eq!(tokens.len(), 1); // nothing may be split off the backtick word
+    assert_eq!(tokens[0].0, TokenType::Word);
+    assert_eq!(tokens[0].1, "`else echo`"); // token text keeps both delimiters
+}
+
+#[test]
+fn escape_does_not_terminate() {
+    // Inside an opaque backtick body, `\<x>` is consumed as a pair,
+    // so an escaped `` ` `` does not falsely terminate the body.
+    let tokens = collect_tokens("`else \\`then\\` echo`"); // lexed text: `else \`then\` echo`
+    assert_eq!(tokens.len(), 1);
+    assert_eq!(tokens[0].0, TokenType::Word);
+    assert_eq!(tokens[0].1, "`else \\`then\\` echo`"); // backslashes kept verbatim
+}
+
+#[test]
+fn literal_newline_escape_consumes_two_bytes() {
+    // `\n` inside an opaque body is literal backslash-then-n, not a
+    // newline; the scanner's escape rule must consume both characters.
+    // NOTE(review): the line counter itself is not asserted by this test.
+    let tokens = collect_tokens("`else a\\nb`"); // lexed text: `else a\nb` (no real newline)
+    assert_eq!(tokens.len(), 1);
+    assert_eq!(tokens[0].0, TokenType::Word);
+    assert_eq!(tokens[0].1, "`else a\\nb`");
+}
+
+#[test]
+fn trailing_backslash_at_eof_surfaces_error() {
+    // A lone trailing `\` with no following character must still produce
+    // a MatchedPair error — the inner `if let` is a no-op, the outer
+    // loop sees EOF, and the scanner reports unterminated backtick.
+    let mut lexer = Lexer::new("`else\\", false); // lexed text: `else\ (backslash is last)
+    assert!(matches!(
+        lexer.next_token(),
+        Err(RableError::MatchedPair { .. }),
+    ));
+}
+
+#[test]
+fn unterminated_body_surfaces_error() {
+    // Invalid body with no closing backtick must surface a
+    // MatchedPair error rather than silently consuming input.
+    let mut lexer = Lexer::new("`else echo", false); // opening backtick only, no closer
+    assert!(matches!(
+        lexer.next_token(),
+        Err(RableError::MatchedPair { .. }), // only the variant is checked, not its fields
+    ));
+}
+
+#[test]
+#[allow(clippy::unwrap_used)] // a lexer error here should panic and fail the test
+fn newlines_in_body_advance_line_counter() {
+    // Newlines inside an opaque backtick body must advance the
+    // line counter so subsequent tokens report the correct line.
+    let mut lexer = Lexer::new("`else\necho\n`\nok", false);
+    let bt = lexer.next_token().unwrap();
+    assert_eq!(bt.kind, TokenType::Word);
+    assert_eq!(bt.value, "`else\necho\n`"); // embedded newlines stay in the word text
+    let nl = lexer.next_token().unwrap();
+    assert_eq!(nl.kind, TokenType::Newline);
+    let ok = lexer.next_token().unwrap();
+    assert_eq!(ok.value, "ok");
+    assert_eq!(ok.line, 4); // 3 newlines precede `ok`: two inside the body, one after it
+}
diff --git a/src/lexer/expansions.rs b/src/lexer/expansions.rs
index 5bb4da3..b0133a0 100644
--- a/src/lexer/expansions.rs
+++ b/src/lexer/expansions.rs
@@ -419,7 +419,10 @@ impl Lexer {
     fn read_backtick_inner(&mut self, wb: &mut WordBuilder) -> Result<()> {
         let body_start = self.pos;
         let outer_depth = self.parser_depth();
-        let (end_pos, end_line) = crate::parser::parse_backtick_body(self, outer_depth)?;
+        let (end_pos, end_line) = match crate::parser::parse_backtick_body(self, outer_depth) {
+            Ok(r) => r,
+            Err(_) => self.scan_backtick_opaque(body_start)?,
+        };
         wb.value
             .extend(self.input[body_start..end_pos].iter().copied());
         self.pos = end_pos;
@@ -427,6 +430,40 @@ impl Lexer {
         Ok(())
     }
 
+    /// Fallback scan for the closing backtick, used when
+    /// `parse_backtick_body` rejects the body: bash lexes a backtick
+    /// body as one word token, so errors inside it are runtime, not
+    /// parse, concerns (issue #38).
+    ///
+    /// `\<x>` is skipped as a two-character escape, so an escaped
+    /// `` ` `` does not terminate; newlines, escaped or not, advance
+    /// `line`. Returns `(end_pos, end_line)`, `end_pos` being one past
+    /// the closing backtick, or a `MatchedPair` error if EOF comes first.
+    fn scan_backtick_opaque(&self, body_start: usize) -> Result<(usize, usize)> {
+        let mut pos = body_start;
+        let mut line = self.line; // counting starts from the lexer's current line
+        while let Some(c) = self.input.get(pos).copied() {
+            match c {
+                '\\' => {
+                    pos += 1; // step over the backslash itself
+                    if let Some(next) = self.input.get(pos).copied() {
+                        if next == '\n' {
+                            line += 1; // an escaped newline still ends a source line
+                        }
+                        pos += 1; // skip the escaped character, even if it is a backtick
+                    }
+                }
+                '`' => return Ok((pos + 1, line)), // unescaped closer: report one past it
+                '\n' => {
+                    line += 1;
+                    pos += 1;
+                }
+                _ => pos += 1,
+            }
+        }
+        Err(RableError::matched_pair("unterminated backtick", pos, line)) // hit EOF
+    }
+
     /// Reads deprecated `$[...]` arithmetic with bracket depth tracking.
     fn read_deprecated_arith(&mut self, wb: &mut WordBuilder) -> Result<()> {
         let mut depth = 1;
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 805f0d2..eae7e10 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -13,6 +13,8 @@ mod quotes;
 pub(super) mod word_builder;
 mod words;
 
+#[cfg(test)]
+mod backtick_opaque_tests;
 #[cfg(test)]
 mod tests;
 
diff --git a/tests/integration.rs b/tests/integration.rs
index 60c4de2..f8e7f99 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -218,13 +218,8 @@ const KNOWN_ORACLE_FAILURES: &[&str] = &[
     // #37 — reserved words as plain words: cases 1, 3, 4 fixed by #44;
     //   case 5 fixed as a side effect of #35; case 2 fixed by #42
     //   (`((` → nested subshell fallback).
-    // #38 — backticks opaque on invalid content
-    "backtick_opaque 1",
-    "backtick_opaque 2",
-    "backtick_opaque 3",
-    "backtick_opaque 4",
-    "backtick_opaque 5",
-    "backtick_opaque 6",
+    // #38 — backticks opaque on invalid content: all 6 cases fixed via
+    //   `scan_backtick_opaque` fallback in `read_backtick_inner`.
     // #39 — heredoc inside $(...); both cases fixed: case 1 fell out
     //   of #40's `<<'EOF'` fix, case 2 via the sloppy-delimiter path
     //   in `read_heredoc_body` + `reformat_bash` using Cmdsub mode.