PatrickLerner · PatrickLerner · Apr 26, 2025 · Apr 26, 2025 · Apr 26, 2025 · Apr 26, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # escrit Changelog
 
+## 0.3.0
+
+- Ignore emoji in texts (emoji are tokenized as non-selectable separators, not as words)
+
 ## 0.2.2
 
 - Updated dependencies

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "escrit"
-version = "0.2.2"
+version = "0.3.0"
 edition = "2021"
 exclude = [
   "texts/*",

diff --git a/src/text_state.rs → src/text_state/mod.rs b/src/text_state.rs → src/text_state/mod.rs
@@ -4,6 +4,9 @@ use crate::dictionary::{Dictionary, KnowledgeLevel};
 
 use regex::Regex;
 
+#[cfg(test)]
+mod tests;
+
 pub struct Token {
     pub content: String,
     pub selectable: bool,
@@ -34,8 +37,8 @@ pub struct TextState {
 
 impl TextState {
     pub fn from_string(content: String) -> Self {
-        let seperator =
-            Regex::new(r####"([ ,./\-–—!\?«»":;…“”\(\)\[\]]+|[0-9]+)"####).expect("Invalid regex");
+        let seperator = Regex::new(r####"([ ,./\-–—!\?«»":;…“”\(\)\[\]]+|[0-9]+|[\p{Emoji_Presentation}\p{Extended_Pictographic}\u{1F000}-\u{1FFFF}]+)"####)
+            .expect("Invalid regex"); // literal pattern: “ and ” stay separators next to the new emoji class
         let paragraphs = content
             .split('\n')
             .map(|line| {

diff --git a/src/text_state/tests.rs b/src/text_state/tests.rs
@@ -0,0 +1,101 @@
+use crate::text_state::TextState;
+
+#[test]
+fn test_tokenization_with_emoji() {
+    // Input has emoji surrounded by spaces (😊, 🚀) and one glued between two words ("emoji👋in")
+    let input = "Hello 😊 world! This is a test 🚀 with emoji. Try emoji👋in words too!";
+    let text_state = TextState::from_string(input.to_string());
+
+    let first_paragraph = &text_state.paragraphs[0];
+
+    // Collect the content of every selectable token of the first paragraph, in order
+    let selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
+
+    // Only words are expected: no emoji, and "emoji👋in" must yield "emoji" and "in" separately
+    let expected_tokens = vec![
+        "Hello".to_string(),
+        "world".to_string(),
+        "This".to_string(),
+        "is".to_string(),
+        "a".to_string(),
+        "test".to_string(),
+        "with".to_string(),
+        "emoji".to_string(),
+        "Try".to_string(),
+        "emoji".to_string(),
+        "in".to_string(),
+        "words".to_string(),
+        "too".to_string(),
+    ];
+
+    assert_eq!(
+        selectable_tokens, expected_tokens,
+        "Emoji characters should not be selectable tokens"
+    );
+
+    // Collect the non-selectable tokens, which is where the emoji are expected to end up
+    let non_selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| !token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
+
+    // `contains` needs an exact element match, so each emoji must be a token of its own
+    assert!(
+        non_selectable_tokens.contains(&"😊".to_string()),
+        "Emoji 😊 should be tokenized separately"
+    );
+    assert!(
+        non_selectable_tokens.contains(&"🚀".to_string()),
+        "Emoji 🚀 should be tokenized separately"
+    );
+    assert!(
+        non_selectable_tokens.contains(&"👋".to_string()),
+        "Emoji 👋 should be tokenized separately"
+    );
+}
+
+#[test]
+fn test_emoji_in_middle_of_word() {
+    // An emoji with no surrounding whitespace ("word👻word") must still act as a word boundary
+    let input = "word👻word should split at emoji";
+    let text_state = TextState::from_string(input.to_string());
+
+    let first_paragraph = &text_state.paragraphs[0];
+
+    // Selectable tokens in order; "word" is expected twice because the emoji splits "word👻word"
+    let selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
+
+    assert_eq!(
+        selectable_tokens,
+        vec![
+            "word".to_string(),
+            "word".to_string(),
+            "should".to_string(),
+            "split".to_string(),
+            "at".to_string(),
+            "emoji".to_string()
+        ],
+        "Words with emoji in the middle should be split"
+    );
+
+    // The emoji itself must show up as its own non-selectable token (exact match via `contains`)
+    let non_selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| !token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
+
+    assert!(
+        non_selectable_tokens.contains(&"👻".to_string()),
+        "Emoji 👻 should be tokenized separately"
+    );
+}
diff --git a/src/ui.rs b/src/ui.rs
@@ -134,8 +134,8 @@
             }
 
             let content = vec![
-                color_by_language_level(Span::from(format!("{:?}", level)), *level),
-                Span::from(format!(" – {}", count)),
+                color_by_language_level(Span::from(format!("{level:?}")), *level),
+                Span::from(format!(" – {count}")),
                 Span::from(format!(" ({:.1} %)", *count as f32 / total as f32 * 100.0)),
             ];