diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf425be..a56cf1e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # escrit Changelog
 
+## 0.3.0
+
+- Ignore emoji in texts
+
 ## 0.2.2
 
 - Updated dependencies
diff --git a/Cargo.lock b/Cargo.lock
index c50ef88..4142c5b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -454,7 +454,7 @@ dependencies = [
 
 [[package]]
 name = "escrit"
-version = "0.2.2"
+version = "0.3.0"
 dependencies = [
  "anyhow",
  "app_dirs2",
diff --git a/Cargo.toml b/Cargo.toml
index bd71d8e..349a454 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "escrit"
-version = "0.2.2"
+version = "0.3.0"
 edition = "2021"
 exclude = [
     "texts/*",
diff --git a/src/text_state.rs b/src/text_state/mod.rs
similarity index 95%
rename from src/text_state.rs
rename to src/text_state/mod.rs
index aff1459..ef8cb5b 100644
--- a/src/text_state.rs
+++ b/src/text_state/mod.rs
@@ -4,6 +4,9 @@ use crate::dictionary::{Dictionary, KnowledgeLevel};
 
 use regex::Regex;
 
+#[cfg(test)]
+mod tests;
+
 pub struct Token {
     pub content: String,
     pub selectable: bool,
@@ -34,8 +37,10 @@ pub struct TextState {
 
 impl TextState {
     pub fn from_string(content: String) -> Self {
-        let seperator =
-            Regex::new(r####"([ ,./\-–—!\?«»":;…“”\(\)\[\]]+|[0-9]+)"####).expect("Invalid regex");
+        // A separator is a run of whitespace/punctuation (incl. curly quotes “ ”),
+        // a run of digits, or a run of emoji/pictographic characters.
+        let seperator = Regex::new(r####"([ ,./\-–—!\?«»":;…“”\(\)\[\]]+|[0-9]+|[\p{Emoji_Presentation}\p{Extended_Pictographic}\u{1F000}-\u{1FFFF}]+)"####)
+            .expect("Invalid regex");
         let paragraphs = content
             .split('\n')
             .map(|line| {
diff --git a/src/text_state/tests.rs b/src/text_state/tests.rs
new file mode 100644
index 0000000..6e5b845
--- /dev/null
+++ b/src/text_state/tests.rs
@@ -0,0 +1,129 @@
+use crate::text_state::TextState;
+
+/// Emoji next to or inside words must act as non-selectable separators.
+#[test]
+fn test_tokenization_with_emoji() {
+    // Test with emoji characters in different contexts
+    let input = "Hello 😊 world! This is a test 🚀 with emoji. Try emoji👋in words too!";
+    let text_state = TextState::from_string(input.to_string());
+
+    let first_paragraph = &text_state.paragraphs[0];
+
+    // Verify that emojis are treated as separators and not as words
+    let selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
+
+    // Expected selectable tokens - emojis should not be in this list
+    let expected_tokens = vec![
+        "Hello".to_string(),
+        "world".to_string(),
+        "This".to_string(),
+        "is".to_string(),
+        "a".to_string(),
+        "test".to_string(),
+        "with".to_string(),
+        "emoji".to_string(),
+        "Try".to_string(),
+        "emoji".to_string(),
+        "in".to_string(),
+        "words".to_string(),
+        "too".to_string(),
+    ];
+
+    assert_eq!(
+        selectable_tokens, expected_tokens,
+        "Emoji characters should not be selectable tokens"
+    );
+
+    // Verify that emojis are tokenized separately
+    let non_selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| !token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
+
+    // The non-selectable tokens should include spaces, punctuation, and emojis
+    assert!(
+        non_selectable_tokens.contains(&"😊".to_string()),
+        "Emoji 😊 should be tokenized separately"
+    );
+    assert!(
+        non_selectable_tokens.contains(&"🚀".to_string()),
+        "Emoji 🚀 should be tokenized separately"
+    );
+    assert!(
+        non_selectable_tokens.contains(&"👋".to_string()),
+        "Emoji 👋 should be tokenized separately"
+    );
+}
+
+/// An emoji embedded in a word splits it into two selectable words.
+#[test]
+fn test_emoji_in_middle_of_word() {
+    // This tests emoji handling when they appear in the middle of words
+    let input = "word👻word should split at emoji";
+    let text_state = TextState::from_string(input.to_string());
+
+    let first_paragraph = &text_state.paragraphs[0];
+
+    // Expected selectable tokens - the word should be split by the emoji
+    let selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
+
+    assert_eq!(
+        selectable_tokens,
+        vec![
+            "word".to_string(),
+            "word".to_string(),
+            "should".to_string(),
+            "split".to_string(),
+            "at".to_string(),
+            "emoji".to_string()
+        ],
+        "Words with emoji in the middle should be split"
+    );
+
+    // Verify the emoji is separate
+    let non_selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| !token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
+
+    assert!(
+        non_selectable_tokens.contains(&"👻".to_string()),
+        "Emoji 👻 should be tokenized separately"
+    );
+}
+
+/// Curly quotation marks must remain separators alongside the emoji classes.
+#[test]
+fn test_curly_quotes_are_separators() {
+    let input = "He said “hello” today";
+    let text_state = TextState::from_string(input.to_string());
+
+    let first_paragraph = &text_state.paragraphs[0];
+
+    let selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
+
+    assert_eq!(
+        selectable_tokens,
+        vec![
+            "He".to_string(),
+            "said".to_string(),
+            "hello".to_string(),
+            "today".to_string()
+        ],
+        "Curly quotes should not become part of a word"
+    );
+}
diff --git a/src/ui.rs b/src/ui.rs
index 41c3b01..cc7e869 100644
--- a/src/ui.rs
+++ b/src/ui.rs
@@ -134,8 +134,8 @@ pub fn ui(f: &mut Frame, app: &mut App) {
             }
 
             let content = vec![
-                color_by_language_level(Span::from(format!("{:?}", level)), *level),
-                Span::from(format!(" – {}", count)),
+                color_by_language_level(Span::from(format!("{level:?}")), *level),
+                Span::from(format!(" – {count}")),
                 Span::from(format!(" ({:.1} %)", *count as f32 / total as f32 * 100.0)),
             ];
 