From b50f32c590d9be0aa1920b8c5a887e58e1b4583a Mon Sep 17 00:00:00 2001 From: PatrickLernerInstaffo Date: Sat, 26 Apr 2025 17:05:50 +0200 Subject: [PATCH 1/4] Treat Emoji like stop characters --- CHANGELOG.md | 4 ++++ Cargo.lock | 2 +- Cargo.toml | 2 +- src/text_state.rs | 4 ++-- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf425be..a56cf1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # escrit Changelog +## 0.3.0 + +- Ignore emoji in texts + ## 0.2.2 - Updated dependencies diff --git a/Cargo.lock b/Cargo.lock index c50ef88..4142c5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -454,7 +454,7 @@ dependencies = [ [[package]] name = "escrit" -version = "0.2.2" +version = "0.3.0" dependencies = [ "anyhow", "app_dirs2", diff --git a/Cargo.toml b/Cargo.toml index bd71d8e..349a454 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "escrit" -version = "0.2.2" +version = "0.3.0" edition = "2021" exclude = [ "texts/*", diff --git a/src/text_state.rs b/src/text_state.rs index aff1459..5f2f559 100644 --- a/src/text_state.rs +++ b/src/text_state.rs @@ -34,8 +34,8 @@ pub struct TextState { impl TextState { pub fn from_string(content: String) -> Self { - let seperator = - Regex::new(r####"([ ,./\-–—!\?«»":;…“”\(\)\[\]]+|[0-9]+)"####).expect("Invalid regex"); + let seperator = Regex::new(r####"([ ,./\-–—!\?«»":;…""\(\)\[\]]+|[0-9]+|[\p{Emoji_Presentation}\p{Extended_Pictographic}\u{1F000}-\u{1FFFF}]+)"####) + .expect("Invalid regex"); let paragraphs = content .split('\n') .map(|line| { From 70c85c4b4b83099d1114e8aa7e1443d56c114fba Mon Sep 17 00:00:00 2001 From: PatrickLernerInstaffo Date: Sat, 26 Apr 2025 17:17:16 +0200 Subject: [PATCH 2/4] Add test --- src/{text_state.rs => text_state/mod.rs} | 3 + src/text_state/tests.rs | 104 +++++++++++++++++++++++ 2 files changed, 107 insertions(+) rename src/{text_state.rs => text_state/mod.rs} (99%) create mode 100644 src/text_state/tests.rs diff --git a/src/text_state.rs b/src/text_state/mod.rs similarity index 99% rename from src/text_state.rs rename to src/text_state/mod.rs index 5f2f559..ef8cb5b 100644 --- a/src/text_state.rs +++ b/src/text_state/mod.rs @@ -4,6 +4,9 @@ use crate::dictionary::{Dictionary, KnowledgeLevel}; use regex::Regex; +#[cfg(test)] +mod tests; + pub struct Token { pub content: String, pub selectable: bool, diff --git a/src/text_state/tests.rs b/src/text_state/tests.rs new file mode 100644 index 0000000..3e95a2e --- /dev/null +++ b/src/text_state/tests.rs @@ -0,0 +1,104 @@ +#[cfg(test)] +mod tests { + use crate::text_state::TextState; + + #[test] + fn test_tokenization_with_emoji() { + // Test with emoji characters in different contexts + let input = "Hello 😊 world! This is a test 🚀 with emoji. Try emoji👋in words too!"; + let text_state = TextState::from_string(input.to_string()); + + let first_paragraph = &text_state.paragraphs[0]; + + // Verify that emojis are treated as separators and not as words + let selectable_tokens: Vec = first_paragraph + .iter() + .filter(|token| token.selectable) + .map(|token| token.content.clone()) + .collect(); + + // Expected selectable tokens - emojis should not be in this list + let expected_tokens = vec![ + "Hello".to_string(), + "world".to_string(), + "This".to_string(), + "is".to_string(), + "a".to_string(), + "test".to_string(), + "with".to_string(), + "emoji".to_string(), + "Try".to_string(), + "emoji".to_string(), + "in".to_string(), + "words".to_string(), + "too".to_string(), + ]; + + assert_eq!( + selectable_tokens, expected_tokens, + "Emoji characters should not be selectable tokens" + ); + + // Verify that emojis are tokenized separately + let non_selectable_tokens: Vec = first_paragraph + .iter() + .filter(|token| !token.selectable) + .map(|token| token.content.clone()) + .collect(); + + // The non-selectable tokens should include spaces, punctuation, and emojis + assert!( + non_selectable_tokens.contains(&"😊".to_string()), + "Emoji 😊 should be tokenized separately" + ); + assert!( + non_selectable_tokens.contains(&"🚀".to_string()), + "Emoji 🚀 should be tokenized separately" + ); + assert!( + non_selectable_tokens.contains(&"👋".to_string()), + "Emoji 👋 should be tokenized separately" + ); + } + + #[test] + fn test_emoji_in_middle_of_word() { + // This tests emoji handling when they appear in the middle of words + let input = "word👻word should split at emoji"; + let text_state = TextState::from_string(input.to_string()); + + let first_paragraph = &text_state.paragraphs[0]; + + // Expected selectable tokens - the word should be split by the emoji + let selectable_tokens: Vec = first_paragraph + .iter() + .filter(|token| token.selectable) + .map(|token| token.content.clone()) + .collect(); + + assert_eq!( + selectable_tokens, + vec![ + "word".to_string(), + "word".to_string(), + "should".to_string(), + "split".to_string(), + "at".to_string(), + "emoji".to_string() + ], + "Words with emoji in the middle should be split" + ); + + // Verify the emoji is separate + let non_selectable_tokens: Vec = first_paragraph + .iter() + .filter(|token| !token.selectable) + .map(|token| token.content.clone()) + .collect(); + + assert!( + non_selectable_tokens.contains(&"👻".to_string()), + "Emoji 👻 should be tokenized separately" + ); + } +} From db78365065ce3a3a256fa0d0ae3ceb56b5689d75 Mon Sep 17 00:00:00 2001 From: PatrickLernerInstaffo Date: Sat, 26 Apr 2025 17:28:03 +0200 Subject: [PATCH 3/4] Fix clippy --- src/ui.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ui.rs b/src/ui.rs index 41c3b01..cc7e869 100644 --- a/src/ui.rs +++ b/src/ui.rs @@ -134,8 +134,8 @@ pub fn ui(f: &mut Frame, app: &mut App) { } let content = vec![ - color_by_language_level(Span::from(format!("{:?}", level)), *level), - Span::from(format!(" – {}", count)), + color_by_language_level(Span::from(format!("{level:?}")), *level), + Span::from(format!(" – {count}")), Span::from(format!(" ({:.1} %)", *count as f32 / total as f32 * 100.0)), ]; From 51b5370b097d6add5a5120b70e3b72935095d6a4 Mon Sep 17 00:00:00 2001 From: PatrickLernerInstaffo Date: Sat, 26 Apr 2025 17:29:36 +0200 Subject: [PATCH 4/4] Fix structure --- src/text_state/tests.rs | 177 ++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 90 deletions(-) diff --git a/src/text_state/tests.rs b/src/text_state/tests.rs index 3e95a2e..6e5b845 100644 --- a/src/text_state/tests.rs +++ b/src/text_state/tests.rs @@ -1,104 +1,101 @@ -#[cfg(test)] -mod tests { - use crate::text_state::TextState; +use crate::text_state::TextState; - #[test] - fn test_tokenization_with_emoji() { - // Test with emoji characters in different contexts - let input = "Hello 😊 world! This is a test 🚀 with emoji. Try emoji👋in words too!"; - let text_state = TextState::from_string(input.to_string()); +#[test] +fn test_tokenization_with_emoji() { + // Test with emoji characters in different contexts + let input = "Hello 😊 world! This is a test 🚀 with emoji. Try emoji👋in words too!"; + let text_state = TextState::from_string(input.to_string()); - let first_paragraph = &text_state.paragraphs[0]; + let first_paragraph = &text_state.paragraphs[0]; - // Verify that emojis are treated as separators and not as words - let selectable_tokens: Vec = first_paragraph - .iter() - .filter(|token| token.selectable) - .map(|token| token.content.clone()) - .collect(); + // Verify that emojis are treated as separators and not as words + let selectable_tokens: Vec = first_paragraph + .iter() + .filter(|token| token.selectable) + .map(|token| token.content.clone()) + .collect(); - // Expected selectable tokens - emojis should not be in this list - let expected_tokens = vec![ - "Hello".to_string(), - "world".to_string(), - "This".to_string(), - "is".to_string(), - "a".to_string(), - "test".to_string(), - "with".to_string(), - "emoji".to_string(), - "Try".to_string(), - "emoji".to_string(), - "in".to_string(), - "words".to_string(), - "too".to_string(), - ]; + // Expected selectable tokens - emojis should not be in this list + let expected_tokens = vec![ + "Hello".to_string(), + "world".to_string(), + "This".to_string(), + "is".to_string(), + "a".to_string(), + "test".to_string(), + "with".to_string(), + "emoji".to_string(), + "Try".to_string(), + "emoji".to_string(), + "in".to_string(), + "words".to_string(), + "too".to_string(), + ]; - assert_eq!( - selectable_tokens, expected_tokens, - "Emoji characters should not be selectable tokens" - ); + assert_eq!( + selectable_tokens, expected_tokens, + "Emoji characters should not be selectable tokens" + ); - // Verify that emojis are tokenized separately - let non_selectable_tokens: Vec = first_paragraph - .iter() - .filter(|token| !token.selectable) - .map(|token| token.content.clone()) - .collect(); + // Verify that emojis are tokenized separately + let non_selectable_tokens: Vec = first_paragraph + .iter() + .filter(|token| !token.selectable) + .map(|token| token.content.clone()) + .collect(); - // The non-selectable tokens should include spaces, punctuation, and emojis - assert!( - non_selectable_tokens.contains(&"😊".to_string()), - "Emoji 😊 should be tokenized separately" - ); - assert!( - non_selectable_tokens.contains(&"🚀".to_string()), - "Emoji 🚀 should be tokenized separately" - ); - assert!( - non_selectable_tokens.contains(&"👋".to_string()), - "Emoji 👋 should be tokenized separately" - ); - } + // The non-selectable tokens should include spaces, punctuation, and emojis + assert!( + non_selectable_tokens.contains(&"😊".to_string()), + "Emoji 😊 should be tokenized separately" + ); + assert!( + non_selectable_tokens.contains(&"🚀".to_string()), + "Emoji 🚀 should be tokenized separately" + ); + assert!( + non_selectable_tokens.contains(&"👋".to_string()), + "Emoji 👋 should be tokenized separately" + ); +} - #[test] - fn test_emoji_in_middle_of_word() { - // This tests emoji handling when they appear in the middle of words - let input = "word👻word should split at emoji"; - let text_state = TextState::from_string(input.to_string()); +#[test] +fn test_emoji_in_middle_of_word() { + // This tests emoji handling when they appear in the middle of words + let input = "word👻word should split at emoji"; + let text_state = TextState::from_string(input.to_string()); - let first_paragraph = &text_state.paragraphs[0]; + let first_paragraph = &text_state.paragraphs[0]; - // Expected selectable tokens - the word should be split by the emoji - let selectable_tokens: Vec = first_paragraph - .iter() - .filter(|token| token.selectable) - .map(|token| token.content.clone()) - .collect(); + // Expected selectable tokens - the word should be split by the emoji + let selectable_tokens: Vec = first_paragraph + .iter() + .filter(|token| token.selectable) + .map(|token| token.content.clone()) + .collect(); - assert_eq!( - selectable_tokens, - vec![ - "word".to_string(), - "word".to_string(), - "should".to_string(), - "split".to_string(), - "at".to_string(), - "emoji".to_string() - ], - "Words with emoji in the middle should be split" - ); + assert_eq!( + selectable_tokens, + vec![ + "word".to_string(), + "word".to_string(), + "should".to_string(), + "split".to_string(), + "at".to_string(), + "emoji".to_string() + ], + "Words with emoji in the middle should be split" + ); - // Verify the emoji is separate - let non_selectable_tokens: Vec = first_paragraph - .iter() - .filter(|token| !token.selectable) - .map(|token| token.content.clone()) - .collect(); + // Verify the emoji is separate + let non_selectable_tokens: Vec = first_paragraph + .iter() + .filter(|token| !token.selectable) + .map(|token| token.content.clone()) + .collect(); - assert!( - non_selectable_tokens.contains(&"👻".to_string()), - "Emoji 👻 should be tokenized separately" - ); - } + assert!( + non_selectable_tokens.contains(&"👻".to_string()), + "Emoji 👻 should be tokenized separately" + ); }