From b50f32c590d9be0aa1920b8c5a887e58e1b4583a Mon Sep 17 00:00:00 2001
From: PatrickLernerInstaffo <patrick@instaffo.com>
Date: Sat, 26 Apr 2025 17:05:50 +0200
Subject: [PATCH 1/4] Treat Emoji like stop characters

---
 CHANGELOG.md      | 4 ++++
 Cargo.lock        | 2 +-
 Cargo.toml        | 2 +-
 src/text_state.rs | 4 ++--
 4 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf425be..a56cf1e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # escrit Changelog
 
+## 0.3.0
+
+- Treat emoji as separators so they are never selectable words
+
 ## 0.2.2
 
 - Updated dependencies
diff --git a/Cargo.lock b/Cargo.lock
index c50ef88..4142c5b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -454,7 +454,7 @@ dependencies = [
 
 [[package]]
 name = "escrit"
-version = "0.2.2"
+version = "0.3.0"
 dependencies = [
  "anyhow",
  "app_dirs2",
diff --git a/Cargo.toml b/Cargo.toml
index bd71d8e..349a454 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "escrit"
-version = "0.2.2"
+version = "0.3.0"
 edition = "2021"
 exclude = [
   "texts/*",
diff --git a/src/text_state.rs b/src/text_state.rs
index aff1459..5f2f559 100644
--- a/src/text_state.rs
+++ b/src/text_state.rs
@@ -34,8 +34,8 @@ pub struct TextState {
 
 impl TextState {
     pub fn from_string(content: String) -> Self {
-        let seperator =
-            Regex::new(r####"([ ,./\-–—!\?«»":;…“”\(\)\[\]]+|[0-9]+)"####).expect("Invalid regex");
+        let seperator = Regex::new(r####"([ ,./\-–—!\?«»":;…“”\(\)\[\]]+|[0-9]+|[\p{Emoji_Presentation}\p{Extended_Pictographic}\u{1F000}-\u{1FFFF}]+)"####)
+            .expect("Invalid regex");
         let paragraphs = content
             .split('\n')
             .map(|line| {

From 70c85c4b4b83099d1114e8aa7e1443d56c114fba Mon Sep 17 00:00:00 2001
From: PatrickLernerInstaffo <patrick@instaffo.com>
Date: Sat, 26 Apr 2025 17:17:16 +0200
Subject: [PATCH 2/4] Add emoji tokenization tests

---
 src/{text_state.rs => text_state/mod.rs} |   3 +
 src/text_state/tests.rs                  | 104 +++++++++++++++++++++++
 2 files changed, 107 insertions(+)
 rename src/{text_state.rs => text_state/mod.rs} (99%)
 create mode 100644 src/text_state/tests.rs

diff --git a/src/text_state.rs b/src/text_state/mod.rs
similarity index 99%
rename from src/text_state.rs
rename to src/text_state/mod.rs
index 5f2f559..ef8cb5b 100644
--- a/src/text_state.rs
+++ b/src/text_state/mod.rs
@@ -4,6 +4,9 @@ use crate::dictionary::{Dictionary, KnowledgeLevel};
 
 use regex::Regex;
 
+#[cfg(test)]
+mod tests;
+
 pub struct Token {
     pub content: String,
     pub selectable: bool,
diff --git a/src/text_state/tests.rs b/src/text_state/tests.rs
new file mode 100644
index 0000000..3e95a2e
--- /dev/null
+++ b/src/text_state/tests.rs
@@ -0,0 +1,104 @@
+#[cfg(test)]
+mod tests {
+    use crate::text_state::TextState;
+
+    #[test]
+    fn test_tokenization_with_emoji() {
+        // Test with emoji characters in different contexts
+        let input = "Hello 😊 world! This is a test 🚀 with emoji. Try emoji👋in words too!";
+        let text_state = TextState::from_string(input.to_string());
+
+        let first_paragraph = &text_state.paragraphs[0];
+
+        // Verify that emojis are treated as separators and not as words
+        let selectable_tokens: Vec<String> = first_paragraph
+            .iter()
+            .filter(|token| token.selectable)
+            .map(|token| token.content.clone())
+            .collect();
+
+        // Expected selectable tokens - emojis should not be in this list
+        let expected_tokens = vec![
+            "Hello".to_string(),
+            "world".to_string(),
+            "This".to_string(),
+            "is".to_string(),
+            "a".to_string(),
+            "test".to_string(),
+            "with".to_string(),
+            "emoji".to_string(),
+            "Try".to_string(),
+            "emoji".to_string(),
+            "in".to_string(),
+            "words".to_string(),
+            "too".to_string(),
+        ];
+
+        assert_eq!(
+            selectable_tokens, expected_tokens,
+            "Emoji characters should not be selectable tokens"
+        );
+
+        // Verify that emojis are tokenized separately
+        let non_selectable_tokens: Vec<String> = first_paragraph
+            .iter()
+            .filter(|token| !token.selectable)
+            .map(|token| token.content.clone())
+            .collect();
+
+        // The non-selectable tokens should include spaces, punctuation, and emojis
+        assert!(
+            non_selectable_tokens.contains(&"😊".to_string()),
+            "Emoji 😊 should be tokenized separately"
+        );
+        assert!(
+            non_selectable_tokens.contains(&"🚀".to_string()),
+            "Emoji 🚀 should be tokenized separately"
+        );
+        assert!(
+            non_selectable_tokens.contains(&"👋".to_string()),
+            "Emoji 👋 should be tokenized separately"
+        );
+    }
+
+    #[test]
+    fn test_emoji_in_middle_of_word() {
+        // This tests emoji handling when they appear in the middle of words
+        let input = "word👻word should split at emoji";
+        let text_state = TextState::from_string(input.to_string());
+
+        let first_paragraph = &text_state.paragraphs[0];
+
+        // Expected selectable tokens - the word should be split by the emoji
+        let selectable_tokens: Vec<String> = first_paragraph
+            .iter()
+            .filter(|token| token.selectable)
+            .map(|token| token.content.clone())
+            .collect();
+
+        assert_eq!(
+            selectable_tokens,
+            vec![
+                "word".to_string(),
+                "word".to_string(),
+                "should".to_string(),
+                "split".to_string(),
+                "at".to_string(),
+                "emoji".to_string()
+            ],
+            "Words with emoji in the middle should be split"
+        );
+
+        // Verify the emoji is separate
+        let non_selectable_tokens: Vec<String> = first_paragraph
+            .iter()
+            .filter(|token| !token.selectable)
+            .map(|token| token.content.clone())
+            .collect();
+
+        assert!(
+            non_selectable_tokens.contains(&"👻".to_string()),
+            "Emoji 👻 should be tokenized separately"
+        );
+    }
+}

From db78365065ce3a3a256fa0d0ae3ceb56b5689d75 Mon Sep 17 00:00:00 2001
From: PatrickLernerInstaffo <patrick@instaffo.com>
Date: Sat, 26 Apr 2025 17:28:03 +0200
Subject: [PATCH 3/4] Fix clippy warnings by inlining format arguments

---
 src/ui.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ui.rs b/src/ui.rs
index 41c3b01..cc7e869 100644
--- a/src/ui.rs
+++ b/src/ui.rs
@@ -134,8 +134,8 @@ pub fn ui(f: &mut Frame, app: &mut App) {
             }
 
             let content = vec![
-                color_by_language_level(Span::from(format!("{:?}", level)), *level),
-                Span::from(format!(" – {}", count)),
+                color_by_language_level(Span::from(format!("{level:?}")), *level),
+                Span::from(format!(" – {count}")),
                 Span::from(format!(" ({:.1} %)", *count as f32 / total as f32 * 100.0)),
             ];
 

From 51b5370b097d6add5a5120b70e3b72935095d6a4 Mon Sep 17 00:00:00 2001
From: PatrickLernerInstaffo <patrick@instaffo.com>
Date: Sat, 26 Apr 2025 17:29:36 +0200
Subject: [PATCH 4/4] Flatten redundant nested tests module

---
 src/text_state/tests.rs | 177 ++++++++++++++++++++--------------------
 1 file changed, 87 insertions(+), 90 deletions(-)

diff --git a/src/text_state/tests.rs b/src/text_state/tests.rs
index 3e95a2e..6e5b845 100644
--- a/src/text_state/tests.rs
+++ b/src/text_state/tests.rs
@@ -1,104 +1,101 @@
-#[cfg(test)]
-mod tests {
-    use crate::text_state::TextState;
+use crate::text_state::TextState;
 
-    #[test]
-    fn test_tokenization_with_emoji() {
-        // Test with emoji characters in different contexts
-        let input = "Hello 😊 world! This is a test 🚀 with emoji. Try emoji👋in words too!";
-        let text_state = TextState::from_string(input.to_string());
+#[test]
+fn test_tokenization_with_emoji() {
+    // Test with emoji characters in different contexts
+    let input = "Hello 😊 world! This is a test 🚀 with emoji. Try emoji👋in words too!";
+    let text_state = TextState::from_string(input.to_string());
 
-        let first_paragraph = &text_state.paragraphs[0];
+    let first_paragraph = &text_state.paragraphs[0];
 
-        // Verify that emojis are treated as separators and not as words
-        let selectable_tokens: Vec<String> = first_paragraph
-            .iter()
-            .filter(|token| token.selectable)
-            .map(|token| token.content.clone())
-            .collect();
+    // Verify that emojis are treated as separators and not as words
+    let selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
 
-        // Expected selectable tokens - emojis should not be in this list
-        let expected_tokens = vec![
-            "Hello".to_string(),
-            "world".to_string(),
-            "This".to_string(),
-            "is".to_string(),
-            "a".to_string(),
-            "test".to_string(),
-            "with".to_string(),
-            "emoji".to_string(),
-            "Try".to_string(),
-            "emoji".to_string(),
-            "in".to_string(),
-            "words".to_string(),
-            "too".to_string(),
-        ];
+    // Expected selectable tokens - emojis should not be in this list
+    let expected_tokens = vec![
+        "Hello".to_string(),
+        "world".to_string(),
+        "This".to_string(),
+        "is".to_string(),
+        "a".to_string(),
+        "test".to_string(),
+        "with".to_string(),
+        "emoji".to_string(),
+        "Try".to_string(),
+        "emoji".to_string(),
+        "in".to_string(),
+        "words".to_string(),
+        "too".to_string(),
+    ];
 
-        assert_eq!(
-            selectable_tokens, expected_tokens,
-            "Emoji characters should not be selectable tokens"
-        );
+    assert_eq!(
+        selectable_tokens, expected_tokens,
+        "Emoji characters should not be selectable tokens"
+    );
 
-        // Verify that emojis are tokenized separately
-        let non_selectable_tokens: Vec<String> = first_paragraph
-            .iter()
-            .filter(|token| !token.selectable)
-            .map(|token| token.content.clone())
-            .collect();
+    // Verify that emojis are tokenized separately
+    let non_selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| !token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
 
-        // The non-selectable tokens should include spaces, punctuation, and emojis
-        assert!(
-            non_selectable_tokens.contains(&"😊".to_string()),
-            "Emoji 😊 should be tokenized separately"
-        );
-        assert!(
-            non_selectable_tokens.contains(&"🚀".to_string()),
-            "Emoji 🚀 should be tokenized separately"
-        );
-        assert!(
-            non_selectable_tokens.contains(&"👋".to_string()),
-            "Emoji 👋 should be tokenized separately"
-        );
-    }
+    // The non-selectable tokens should include spaces, punctuation, and emojis
+    assert!(
+        non_selectable_tokens.contains(&"😊".to_string()),
+        "Emoji 😊 should be tokenized separately"
+    );
+    assert!(
+        non_selectable_tokens.contains(&"🚀".to_string()),
+        "Emoji 🚀 should be tokenized separately"
+    );
+    assert!(
+        non_selectable_tokens.contains(&"👋".to_string()),
+        "Emoji 👋 should be tokenized separately"
+    );
+}
 
-    #[test]
-    fn test_emoji_in_middle_of_word() {
-        // This tests emoji handling when they appear in the middle of words
-        let input = "word👻word should split at emoji";
-        let text_state = TextState::from_string(input.to_string());
+#[test]
+fn test_emoji_in_middle_of_word() {
+    // This tests emoji handling when they appear in the middle of words
+    let input = "word👻word should split at emoji";
+    let text_state = TextState::from_string(input.to_string());
 
-        let first_paragraph = &text_state.paragraphs[0];
+    let first_paragraph = &text_state.paragraphs[0];
 
-        // Expected selectable tokens - the word should be split by the emoji
-        let selectable_tokens: Vec<String> = first_paragraph
-            .iter()
-            .filter(|token| token.selectable)
-            .map(|token| token.content.clone())
-            .collect();
+    // Expected selectable tokens - the word should be split by the emoji
+    let selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
 
-        assert_eq!(
-            selectable_tokens,
-            vec![
-                "word".to_string(),
-                "word".to_string(),
-                "should".to_string(),
-                "split".to_string(),
-                "at".to_string(),
-                "emoji".to_string()
-            ],
-            "Words with emoji in the middle should be split"
-        );
+    assert_eq!(
+        selectable_tokens,
+        vec![
+            "word".to_string(),
+            "word".to_string(),
+            "should".to_string(),
+            "split".to_string(),
+            "at".to_string(),
+            "emoji".to_string()
+        ],
+        "Words with emoji in the middle should be split"
+    );
 
-        // Verify the emoji is separate
-        let non_selectable_tokens: Vec<String> = first_paragraph
-            .iter()
-            .filter(|token| !token.selectable)
-            .map(|token| token.content.clone())
-            .collect();
+    // Verify the emoji is separate
+    let non_selectable_tokens: Vec<String> = first_paragraph
+        .iter()
+        .filter(|token| !token.selectable)
+        .map(|token| token.content.clone())
+        .collect();
 
-        assert!(
-            non_selectable_tokens.contains(&"👻".to_string()),
-            "Emoji 👻 should be tokenized separately"
-        );
-    }
+    assert!(
+        non_selectable_tokens.contains(&"👻".to_string()),
+        "Emoji 👻 should be tokenized separately"
+    );
 }