diff --git a/README.md b/README.md index 75b44d4..6a720fa 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ Let's look at a few cases: Seems simple enough. But then we also have these cases: * `https://en.wikipedia.org/wiki/Link_(The_Legend_of_Zelda)` should include the trailing paren +* `http://example.com/./` should include the dot because the slash makes it a path * `http://üñîçøðé.com/ä` should also work for Unicode (including Emoji and Punycode) * `<http://example.com/>` should not include angle brackets diff --git a/src/url.rs b/src/url.rs index 3f106e9..c1bfc9b 100644 --- a/src/url.rs +++ b/src/url.rs @@ -207,7 +207,7 @@ fn find_url_end(s: &str, quote: Option<char>, iri_parsing_enabled: bool) -> Opti let mut curly = 0; let mut single_quote = false; - let mut previous_can_be_last = true; + let mut previous_is_url_char = true; let mut end = Some(0); if !s[0..].starts_with("/") && !s[0..].starts_with("?") { @@ -232,9 +232,12 @@ fn find_url_end(s: &str, quote: Option<char>, iri_parsing_enabled: bool) -> Opti false } '/' => { - // This may be part of an URL and at the end, but not if the previous character - // can't be the end of an URL - previous_can_be_last + // A slash can be the end of a URL if the previous character is a valid URL + // character. This means that delimiters like `!` or `.` before a `/` are + // included in the URL, e.g. `/!/`, but non-URL characters (like non-ASCII + // when IRI parsing is disabled) before a `/` are not. + // See https://github.com/robinst/linkify/issues/90 + previous_is_url_char } '(' => { round += 1; @@ -288,7 +291,12 @@ fn find_url_end(s: &str, quote: Option<char>, iri_parsing_enabled: bool) -> Opti if can_be_last { end = Some(i + c.len_utf8()); } - previous_can_be_last = can_be_last; + // Track whether the current character is a valid URL character (even if it can't + // be the last character). Delimiters like `!` are valid URL characters, but + // non-ASCII characters are not valid URL characters when IRI parsing is disabled. 
+ // This matters for `/` above: a slash after a delimiter like `!` should extend + // the URL (e.g. `/!/`), but a slash after a non-URL character should not. + previous_is_url_char = c.is_ascii() || iri_parsing_enabled; } end diff --git a/tests/url.rs b/tests/url.rs index 201b90e..41b1650 100644 --- a/tests/url.rs +++ b/tests/url.rs @@ -151,6 +151,30 @@ fn delimiter_at_end_no_protocol() { assert_urls_without_protocol("example.org/;", "|example.org/|;"); } +#[test] +fn delimiter_followed_by_slash() { + // https://github.com/robinst/linkify/issues/90 + assert_linked("http://example.org/!/", "|http://example.org/!/|"); + assert_linked("http://example.org/test!/", "|http://example.org/test!/|"); + assert_linked("http://example.org/./", "|http://example.org/./|"); + assert_linked("http://example.org/,/", "|http://example.org/,/|"); + assert_linked("http://example.org/:/", "|http://example.org/:/|"); + assert_linked("http://example.org/;/", "|http://example.org/;/|"); + assert_linked("http://example.org/?/", "|http://example.org/?/|"); + // Delimiter followed by slash and more path + assert_linked("http://example.org/!/a", "|http://example.org/!/a|"); + assert_linked("http://example.org/test!/a", "|http://example.org/test!/a|"); +} + +#[test] +fn delimiter_followed_by_slash_no_protocol() { + assert_urls_without_protocol("example.org/!/", "|example.org/!/|"); + assert_urls_without_protocol("example.org/test!/", "|example.org/test!/|"); + assert_urls_without_protocol("example.org/./", "|example.org/./|"); + assert_urls_without_protocol("example.org/,/", "|example.org/,/|"); + assert_urls_without_protocol("example.org/;/", "|example.org/;/|"); +} + #[test] fn matching_punctuation() { assert_linked("http://example.org/a(b)", "|http://example.org/a(b)|");