From 391b4f219e5c5559ae1496e8a23aee92ce91757c Mon Sep 17 00:00:00 2001 From: Alex Pyattaev Date: Sat, 1 Jun 2024 01:47:15 +0300 Subject: [PATCH] Refine concept of Exactly to do exact matching, as the name implies --- src/core/mod.rs | 2 +- src/core/traits.rs | 52 +++++++++ src/core/type.rs | 274 +++++++++++++++++++++++++++------------------ src/lib.rs | 51 +++++++-- 4 files changed, 259 insertions(+), 120 deletions(-) diff --git a/src/core/mod.rs b/src/core/mod.rs index 501fa08..7103bad 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -2,4 +2,4 @@ mod traits; mod r#type; pub use r#type::{not, Input, Input::*, Type, Type::*}; -pub use traits::{AsRegex, Condition, Error, Result}; +pub use traits::{AsRegex, Condition, Error, Grouping, Result}; diff --git a/src/core/traits.rs b/src/core/traits.rs index 93385ab..f5cf87b 100644 --- a/src/core/traits.rs +++ b/src/core/traits.rs @@ -41,3 +41,55 @@ pub trait Condition: AsRegex + Sized { Regex::new(&format!("(?:{})?", self.to_string())).expect("Invalid regex (optionally)") } } + +pub trait Grouping: AsRegex { + /// This defines the entire input so far as a named capture group. + /// + /// # Example + /// ``` + /// use magic_regexp::{create_reg_exp, Condition,Grouping, Digit, LetterLowercase, OneOrMore}; + /// + /// let regex = create_reg_exp(OneOrMore(Digit).grouped_as("digits")).unwrap(); + /// assert_eq!(®ex.captures("1").unwrap()["digits"], "1"); + /// ``` + fn grouped_as(&self, name: &str) -> Regex { + Regex::new(&format!(r"(?P<{}>{})", name, self.to_string())).expect("Invalid regex") + } + + /// This defines the entire input so far as a named capture group. + /// This is an alias for `grouped_as`. + fn r#as(&self, name: &str) -> Regex { + self.grouped_as(name) + } + + /// This defines the entire input so far as an anonymous group. + /// + /// # Example + /// ``` + /// use magic_regexp::{create_reg_exp, Condition, Grouping, Digit, LetterLowercase, OneOrMore, Type, Char, Whitespace, Maybe}; + /// use regex::Regex; + /// + /// let regex = create_reg_exp(OneOrMore(Digit).grouped() + /// .and(Whitespace) + /// .and(OneOrMore(Char).or(Whitespace).optionally()) + /// .and(OneOrMore(Digit).grouped()) + /// ).unwrap(); + /// assert_eq!(®ex.captures("1 5").unwrap()[1], "1"); + /// assert_eq!(®ex.captures("134 23").unwrap()[1], "134"); + /// // The folloing example is not really useful, because it shows only the first match. + /// // The second match is not captured. See the next example for a more useful example. + /// assert_eq!(®ex.captures("this is the 134 test 213").unwrap()[1], "134"); + /// // This is a bit more complex, because we are using anonymous groups in regex. + /// let cap = ®ex + /// .find_iter("multiple numbers 134 2123") + /// .filter_map(|digits| digits.as_str().parse().ok()) + /// .collect::>() + /// .join(" "); + /// let expected = ["134", "2123"].join(" "); + /// assert_eq!(cap, &expected); + /// ``` + /// + fn grouped(&self) -> Regex { + Regex::new(&format!(r"({})", self.to_string())).expect("Invalid regex") + } +} diff --git a/src/core/type.rs b/src/core/type.rs index 13c4392..6d434c6 100644 --- a/src/core/type.rs +++ b/src/core/type.rs @@ -1,6 +1,6 @@ -use std::fmt::{Display}; -use crate::{AsRegex, Condition, Result}; +use crate::{AsRegex, Condition, Grouping, Result}; use regex::Regex; +use std::fmt::{Display, Write}; /// Represents a regex type. This enum is used to create the smallest regex statement. /// For example, `Type::Digit` will create the regex `\d`. @@ -10,25 +10,41 @@ use regex::Regex; /// use magic_regexp::{OneOrMore, Type::Digit}; /// /// let input = OneOrMore(Digit); -/// assert_eq!(input.to_string(), r"(\d+)"); // Note that the regex is wrapped in parentheses. +/// assert_eq!(input.to_string(), r"\d+"); // Note that the regex is wrapped in parentheses. /// ``` pub enum Type<'a> { + ///0-9 Digit, + ///[^0-9] NotDigit, WordBoundary, NotWordBoundary, + /// contigous sequence of WordChar Word, + /// contigous sequence of characters not in Word class + NotWord, + /// Stuff that you find in words (see UTS18) WordChar, + /// Anything not in WordChar NotWordChar, + /// String literal. Will be escaped if necessary. Text(&'a str), - Options(&'a str), + /// Any character from set provided. + AnyOf(&'a str), + /// Any character not in set provided. + NoneOf(&'a str), + /// Any character Char, Whitespace, NotWhitespace, + /// Any unicode letter Letter, + /// Any unicode non-letter NotLetter, + LetterLowercase, NotLetterLowercase, + LetterUppercase, NotLetterUppercase, Tab, @@ -37,44 +53,108 @@ pub enum Type<'a> { NotLinefeed, CarriageReturn, NotCarriageReturn, + /// Raw input to be passed to regex crate as-is + Raw(&'a str), } -impl<'a> AsRegex for Type<'a> {} +impl AsRegex for Type<'_> {} impl<'a> Display for Type<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let txt; - let str = match self { + let s = match self { + Type::Raw(r) => r, + Type::Digit => r"\d", Type::NotDigit => r"\D", + Type::WordBoundary => r"\b", Type::NotWordBoundary => r"\B", + Type::Word => r"\b\w+\b", + Type::NotWord => r"W+", + Type::WordChar => r"\w", Type::NotWordChar => r"\W", + Type::Char => r".", - Type::Text(text) => text, + Type::Text(text) => { + if text.len() > 1 { + //Wrap text into a non-capturing group so it can be combined with +, * and so on. + f.write_str("(?:")?; + } + + for c in text.chars() { + if must_escape_text(c) { + f.write_char('\\')?; + } + f.write_char(c)?; + } + + if text.len() > 1 { + f.write_char(')')?; + } + return Ok(()); + } + //There is a hack possible to enable "not text" but it is silly inefficient + /*Type::NotText(text) => { + for c in text.chars() { + f.write_str("[^")?; + // Escape if necessary + if must_escape_in_set(c) { + f.write_char('\\')?; + } + f.write_char(c)?; + f.write_char(']')?; + } + return Ok(()); + } */ Type::Whitespace => r"\s", Type::NotWhitespace => r"\S", - Type::Letter => r"[a-zA-Z]", - Type::NotLetter => r"[^a-zA-Z]", - Type::LetterLowercase => r"[a-z]", - Type::NotLetterLowercase => r"[^a-z]", - Type::LetterUppercase => r"[A-Z]", - Type::NotLetterUppercase => r"[^A-Z]", + + Type::Letter => r"\p{Letter}", + Type::NotLetter => r"\P{Letter}", + + Type::LetterLowercase => r"\p{Lowercase Letter}", + Type::NotLetterLowercase => r"\P{Lowercase Letter}", + + Type::LetterUppercase => r"\p{Uppercase Letter}", + Type::NotLetterUppercase => r"\P{Uppercase Letter}", + Type::Tab => r"\t", Type::NotTab => r"^\t", + Type::Linefeed => r"\n", Type::NotLinefeed => r"^\n", + Type::CarriageReturn => r"\r", Type::NotCarriageReturn => r"^\r", - Type::Options(options) => { - txt = format!("[{}]", options); - txt.as_str() + + Type::AnyOf(options) => { + f.write_char('[')?; + for c in options.chars() { + // Escape if necessary + if must_escape_in_set(c) { + f.write_char('\\')?; + } + f.write_char(c)?; + } + f.write_char(']')?; + return Ok(()); } - } - .to_string(); - write!(f, "{}", str) + Type::NoneOf(options) => { + f.write_str("[^")?; // negative set + for c in options.chars() { + // Escape if necessary + if must_escape_in_set(c) { + f.write_char('\\')?; + } + f.write_char(c)?; + } + f.write_char(']')?; // close set + return Ok(()); + } + }; + f.write_str(s) } } @@ -86,53 +166,58 @@ impl<'a> Display for Type<'a> { /// /// # Examples /// ``` -/// use magic_regexp::{OneOrMore, not, Options}; +/// use magic_regexp::{OneOrMore, not, AnyOf}; /// -/// let input = OneOrMore(not(not(Options("01")))); -/// assert_eq!(input.to_string(), r"([01]+)"); +/// let input = OneOrMore(not(not(AnyOf("01")))); +/// assert_eq!(input.to_string(), r"[01]+"); /// ``` pub fn not(t: Type) -> Type { match t { Type::Digit => Type::NotDigit, Type::NotDigit => Type::Digit, + Type::WordBoundary => Type::NotWordBoundary, Type::NotWordBoundary => Type::WordBoundary, + Type::WordChar => Type::NotWordChar, Type::NotWordChar => Type::WordChar, + Type::Whitespace => Type::NotWhitespace, Type::NotWhitespace => Type::Whitespace, + Type::Letter => Type::NotLetter, Type::NotLetter => Type::Letter, + Type::LetterLowercase => Type::NotLetterLowercase, Type::NotLetterLowercase => Type::LetterLowercase, + Type::LetterUppercase => Type::NotLetterUppercase, Type::NotLetterUppercase => Type::LetterUppercase, + Type::Tab => Type::NotTab, Type::NotTab => Type::Tab, + Type::Linefeed => Type::NotLinefeed, Type::NotLinefeed => Type::Linefeed, + Type::CarriageReturn => Type::NotCarriageReturn, Type::NotCarriageReturn => Type::CarriageReturn, - Type::Text(t) => Type::Text(Box::leak(format!("^{}", t).into_boxed_str())), - Type::Options(t) => { - if let Some(first) = t.chars().next() { - let opt: String = if first == '^' { - t[1..].to_string() - } else { - format!("^{}", t) - }; - Type::Options(Box::leak(opt.into_boxed_str())) - } else { - panic!("Invalid options: {}", t); - } - } - _ => t, + + Type::AnyOf(t) => Type::NoneOf(t), + Type::NoneOf(t) => Type::AnyOf(t), + + Type::Word => Type::NotWord, + Type::NotWord => Type::Word, + + Type::Text(_) => unreachable!("Negating Text is not something regex can do"), + Type::Char => unreachable!("Negating Char makes no sense"), + Type::Raw(_) => unreachable!("Negating Raw makes no sense"), } } /// This is a regex input that can be used to match a single character or a group of characters. /// Can be used to create a regex that matches a single character or a group of characters. -/// For example, `Input::Exactly(Type::Digit)` will match a single digit. +/// For example, `Input::Exactly(Type::Digit)` will match a single digit, and nothing else. /// /// # Example /// ``` @@ -141,7 +226,7 @@ pub fn not(t: Type) -> Type { /// let regex = create_reg_exp(Input::Exactly(Type::Digit)).unwrap(); /// assert!(regex.is_match("1")); /// assert!(!regex.is_match("12")); -/// assert!(regex.is_match("1 2")); +/// assert!(!regex.is_match("1 2")); /// ``` /// /// # Example @@ -166,105 +251,80 @@ pub fn not(t: Type) -> Type { /// assert!(regex.is_match("1 2")); /// ``` pub enum Input<'a> { + /// Matches one or more instance of content OneOrMore(Type<'a>), + /// Matches exactly one instance (and NOTHING else), will anchor at start and end of line. Exactly only really makes sense as outer-most tag. Exactly(Type<'a>), + /// Matches if present, ignores if not Maybe(Type<'a>), - Times(Type<'a>, usize), + /// Matches exactly this many times + Times(Type<'a>, usize), //TODO: take range to allow for full feature of this pattern +} + +///Test if a char needs to be escaped for text literals +fn must_escape_text(c: char) -> bool { + const ESCAPE_REPLACE_RE: &str = r".*+?^${}()|[]\/"; + ESCAPE_REPLACE_RE.chars().any(|p| p == c) +} + +///Test if a char needs to be escaped in a set definition +fn must_escape_in_set(c: char) -> bool { + r"\-]".chars().any(|p| p == c) } impl<'a> Display for Input<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - const ESCAPE_REPLACE_RE: &str = r"[.*+?^${}()|[\\]\\/]"; - let str = match self { - Input::OneOrMore(t) => format!("({}+)", t), + Input::OneOrMore(t) => format!("{t}+"), Input::Exactly(t) => match t { - Type::Text(t) => Regex::new(ESCAPE_REPLACE_RE) - .expect("Invalid replace_all regex") - .replace_all(t, r"\$0").parse().unwrap() - , - _ => format!(r"\b{}\b", t), + _ => format!(r"^{t}$"), }, - Input::Maybe(t) => format!("({}?)", t), - Input::Times(t, n) => format!("{}{{{}}}", t, n), + Input::Maybe(t) => format!("(?:{t})?"), + Input::Times(t, n) => format!("{t}{{{n}}}"), }; write!(f, "{}", str) } } -impl<'a> AsRegex for Input<'a> { +impl<'a> std::fmt::Debug for Type<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + Display::fmt(self, f) + } +} + +impl<'a> std::fmt::Debug for Input<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + Display::fmt(self, f) + } +} + +impl AsRegex for Input<'_> { fn as_regex(&self) -> Result { Ok(Regex::new(&self.to_string())?) } } +impl Condition for Input<'_> {} + +impl Condition for Type<'_> {} + /// Returns a Regex, which chains the 2 given regexes with an `and` operator. /// /// # Example /// ``` -/// use magic_regexp::{create_reg_exp, Condition, Exactly, Digit, LetterLowercase}; +/// use magic_regexp::{create_reg_exp, Condition, Digit, LetterLowercase}; /// -/// let regex = create_reg_exp(Exactly(Digit).or(Exactly(LetterLowercase))).unwrap(); +/// let regex = create_reg_exp(Digit.or(LetterLowercase)).unwrap(); /// assert!(regex.is_match("1")); /// assert!(regex.is_match("a")); /// assert!(!regex.is_match("A")); -/// assert!(!regex.is_match("12")); -/// assert!(!regex.is_match("1a")); +/// assert!(regex.is_match("12")); +/// assert!(regex.is_match("1a")); /// assert!(regex.is_match("1 a")); /// ``` -impl<'a> Condition for Input<'a> {} - -impl<'a> Input<'a> { - /// This defines the entire input so far as a named capture group. - /// - /// # Example - /// ``` - /// use magic_regexp::{create_reg_exp, Condition, Exactly, Digit, LetterLowercase, OneOrMore}; - /// - /// let regex = create_reg_exp(OneOrMore(Digit).grouped_as("digits")).unwrap(); - /// assert_eq!(®ex.captures("1").unwrap()["digits"], "1"); - /// ``` - pub fn grouped_as(&self, name: &str) -> Regex { - Regex::new(&format!(r"(?P<{}>{})", name, self.to_string())).expect("Invalid regex") - } - /// This defines the entire input so far as a named capture group. - /// This is an alias for `grouped_as`. - pub fn r#as(&self, name: &str) -> Regex { - self.grouped_as(name) - } - - /// This defines the entire input so far as an anonymous group. - /// - /// # Example - /// ``` - /// use magic_regexp::{create_reg_exp, Condition, Exactly, Digit, LetterLowercase, OneOrMore, Type, Char, Whitespace, Maybe, Options}; - /// use regex::Regex; - /// - /// let regex = create_reg_exp(OneOrMore(Digit).grouped() - /// .and(Exactly(Whitespace)) - /// .and(OneOrMore(Char).or(Exactly(Whitespace)).optionally()) - /// .and(OneOrMore(Digit).grouped()) - /// ).unwrap(); - /// assert_eq!(®ex.captures("1 5").unwrap()[1], "1"); - /// assert_eq!(®ex.captures("134 23").unwrap()[1], "134"); - /// // The folloing example is not really useful, because it shows only the first match. - /// // The second match is not captured. See the next example for a more useful example. - /// assert_eq!(®ex.captures("this is the 134 test 213").unwrap()[1], "134"); - /// // This is a bit more complex, because we are using anonymous groups in regex. - /// let cap = ®ex - /// .find_iter("multiple numbers 134 2123") - /// .filter_map(|digits| digits.as_str().parse().ok()) - /// .collect::>() - /// .join(" "); - /// let expected = ["134", "2123"].join(" "); - /// assert_eq!(cap, &expected); - /// ``` - /// - pub fn grouped(&self) -> Regex { - Regex::new(&format!(r"({})", self.to_string())).expect("Invalid regex") - } -} +impl Grouping for Input<'_> {} +impl Grouping for Type<'_> {} impl AsRegex for Regex {} diff --git a/src/lib.rs b/src/lib.rs index 95c74d1..f59edd9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,14 +7,14 @@ For more specific details on the API for regular expressions, please see around # Example: Find a date ```rust -use magic_regexp::{Digit, Times, create_reg_exp, Exactly, Condition, Text}; +use magic_regexp::{Digit, Times, create_reg_exp, Grouping, Condition, Text}; use regex::Regex; const TO_SEARCH: &'static str = "On 2010-03-14, foo happened. On 2014-10-14, bar happened."; -let input = Times(Digit, 4).and(Exactly(Text("-"))).and(Times(Digit, 2)).and(Exactly(Text(("-")))).and(Times(Digit, 2)); +let input = Times(Digit, 4).and(Text("-")).and(Times(Digit, 2)).and(Text(("-"))).and(Times(Digit, 2)); assert_eq!(input.to_string(), r"\d{4}-\d{2}-\d{2}"); -let input = Times(Digit, 4).grouped_as("year").and(Exactly(Text("-"))).and(Times(Digit, 2).grouped_as("month")).and(Exactly(Text(("-")))).and(Times(Digit, 2).grouped_as("day")); +let input = Times(Digit, 4).grouped_as("year").and(Text("-")).and(Times(Digit, 2).grouped_as("month")).and(Text(("-"))).and(Times(Digit, 2).grouped_as("day")); let re = create_reg_exp(input).unwrap(); assert!(re.is_match("2014-01-01")); assert_eq!(re.find_iter(TO_SEARCH).count(), 2); @@ -41,13 +41,13 @@ use regex::Regex; /// /// # Example /// ``` -/// use magic_regexp::{Digit, Times, create_reg_exp, Exactly, Condition, Text}; +/// use magic_regexp::{Digit, Times, create_reg_exp, Condition, Grouping, Text}; /// use regex::Regex; /// /// const TO_SEARCH: &'static str = "On 2010-03-14, foo happened. On 2014-10-14, bar happened."; -/// let input = Times(Digit, 4).and(Exactly(Text("-"))).and(Times(Digit, 2)).and(Exactly(Text(("-")))).and(Times(Digit, 2)); +/// let input = Times(Digit, 4).and(Text("-")).and(Times(Digit, 2)).and(Text(("-"))).and(Times(Digit, 2)); /// assert_eq!(input.to_string(), r"\d{4}-\d{2}-\d{2}"); -/// let input = Times(Digit, 4).grouped_as("year").and(Exactly(Text("-"))).and(Times(Digit, 2).grouped_as("month")).and(Exactly(Text(("-")))).and(Times(Digit, 2).grouped_as("day")); +/// let input = Times(Digit, 4).grouped_as("year").and(Text("-")).and(Times(Digit, 2).grouped_as("month")).and(Text(("-"))).and(Times(Digit, 2).grouped_as("day")); /// let re = create_reg_exp(input).unwrap(); /// assert!(re.is_match("2014-01-01")); /// assert_eq!(re.find_iter(TO_SEARCH).count(), 2); @@ -67,9 +67,9 @@ pub fn create_reg_exp(input: impl AsRegex) -> Result { #[cfg(test)] mod tests { - use super::{create_reg_exp, not, Exactly, OneOrMore, Type::Digit}; - use crate::Input::Maybe; - use crate::Type::Text; + use super::Input::Maybe; + use super::Type::{Digit, Text, Whitespace, Word}; + use super::{create_reg_exp, not, Condition, Exactly, OneOrMore}; #[test] fn test_single_digit() { @@ -77,7 +77,7 @@ mod tests { let regex = create_reg_exp(input).unwrap(); assert!(regex.is_match("1")); assert!(!regex.is_match("12")); - assert!(regex.is_match("1 2")); + assert!(!regex.is_match("1 2")); } #[test] @@ -86,7 +86,7 @@ mod tests { let regex = create_reg_exp(input).unwrap(); assert!(regex.is_match("1")); assert!(regex.is_match("")); - assert!(regex.is_match("12")); + assert!(regex.is_match("a12")); assert!(regex.is_match("1 2")); } #[test] @@ -105,6 +105,7 @@ mod tests { let input = Exactly(not(Digit)); let regex = create_reg_exp(input).unwrap(); assert!(!regex.is_match("1")); + assert!(!regex.is_match("ab")); assert!(regex.is_match("a")); } @@ -118,9 +119,35 @@ mod tests { #[test] fn test_exactly_text() { - let input = Exactly(Text("welt".into())); + let input = Text("welt"); let regex = create_reg_exp(input).unwrap(); assert!(regex.is_match("Hallo welt")); assert!(!regex.is_match("Hallo Welt")); + + let input = Exactly(Text("Hallo welt")); + let regex = create_reg_exp(input).unwrap(); + assert!(regex.is_match("Hallo welt")); + assert!(!regex.is_match("Hallo Welt")); + } + + #[test] + fn test_non_ascii() { + let input = Exactly(Text("Привет мир")); + let regex = create_reg_exp(input).unwrap(); + assert!(!regex.is_match("Hello world")); + assert!(regex.is_match("Привет мир")); + + let input = Word.and(Whitespace).and(Word); + let regex = create_reg_exp(input).unwrap(); + assert!(regex.is_match("Привет мир")); + } + + #[test] + fn one_or_more() { + use super::{not, AnyOf, OneOrMore}; + + let input = OneOrMore(not(not(AnyOf("01")))); + + assert_eq!(input.to_string(), r"[01]+"); } }