Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# escrit Changelog

## 0.3.0

- Ignore emoji in texts

## 0.2.2

- Updated dependencies
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "escrit"
version = "0.2.2"
version = "0.3.0"
edition = "2021"
exclude = [
"texts/*",
Expand Down
7 changes: 5 additions & 2 deletions src/text_state.rs → src/text_state/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ use crate::dictionary::{Dictionary, KnowledgeLevel};

use regex::Regex;

#[cfg(test)]
mod tests;

pub struct Token {
pub content: String,
pub selectable: bool,
Expand Down Expand Up @@ -34,8 +37,8 @@ pub struct TextState {

impl TextState {
pub fn from_string(content: String) -> Self {
let seperator =
Regex::new(r####"([ ,./\-–—!\?«»":;…“”\(\)\[\]]+|[0-9]+)"####).expect("Invalid regex");
let seperator = Regex::new(r####"([ ,./\-–—!\?«»":;…""\(\)\[\]]+|[0-9]+|[\p{Emoji_Presentation}\p{Extended_Pictographic}\u{1F000}-\u{1FFFF}]+)"####)
.expect("Invalid regex");
let paragraphs = content
.split('\n')
.map(|line| {
Expand Down
101 changes: 101 additions & 0 deletions src/text_state/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
use crate::text_state::TextState;

/// Emoji must act as separators during tokenization: they never appear among
/// the selectable tokens, yet each one is still emitted as its own
/// non-selectable token.
#[test]
fn test_tokenization_with_emoji() {
    // Emoji appear both space-delimited (😊, 🚀) and glued between letters (👋).
    let input = "Hello 😊 world! This is a test 🚀 with emoji. Try emoji👋in words too!";
    let text_state = TextState::from_string(input.to_string());

    let first_paragraph = &text_state.paragraphs[0];

    // Borrow the token contents rather than cloning each one into a new String.
    let selectable_tokens: Vec<&str> = first_paragraph
        .iter()
        .filter(|token| token.selectable)
        .map(|token| token.content.as_str())
        .collect();

    // Expected selectable tokens - emojis should not be in this list.
    let expected_tokens = [
        "Hello", "world", "This", "is", "a", "test", "with", "emoji", "Try", "emoji", "in",
        "words", "too",
    ];

    assert_eq!(
        selectable_tokens, expected_tokens,
        "Emoji characters should not be selectable tokens"
    );

    // The non-selectable tokens should include spaces, punctuation, and emojis.
    let non_selectable_tokens: Vec<&str> = first_paragraph
        .iter()
        .filter(|token| !token.selectable)
        .map(|token| token.content.as_str())
        .collect();

    // Every emoji from the input must show up as a stand-alone token.
    for emoji in ["😊", "🚀", "👋"] {
        assert!(
            non_selectable_tokens.contains(&emoji),
            "Emoji {emoji} should be tokenized separately"
        );
    }
}

/// An emoji embedded directly inside a word splits that word into two
/// selectable tokens, with the emoji emitted as its own non-selectable token.
#[test]
fn test_emoji_in_middle_of_word() {
    // No whitespace around the emoji: the split must come from the emoji itself.
    let input = "word👻word should split at emoji";
    let text_state = TextState::from_string(input.to_string());

    let first_paragraph = &text_state.paragraphs[0];

    // Borrow the token contents rather than cloning each one into a new String.
    let selectable_tokens: Vec<&str> = first_paragraph
        .iter()
        .filter(|token| token.selectable)
        .map(|token| token.content.as_str())
        .collect();

    // Expected selectable tokens - the word should be split by the emoji.
    assert_eq!(
        selectable_tokens,
        ["word", "word", "should", "split", "at", "emoji"],
        "Words with emoji in the middle should be split"
    );

    // Verify the emoji is separate.
    let non_selectable_tokens: Vec<&str> = first_paragraph
        .iter()
        .filter(|token| !token.selectable)
        .map(|token| token.content.as_str())
        .collect();

    assert!(
        non_selectable_tokens.contains(&"👻"),
        "Emoji 👻 should be tokenized separately"
    );
}
4 changes: 2 additions & 2 deletions src/ui.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@
}

let content = vec![
color_by_language_level(Span::from(format!("{:?}", level)), *level),
Span::from(format!(" – {}", count)),
color_by_language_level(Span::from(format!("{level:?}")), *level),
Span::from(format!(" – {count}")),

Check warning on line 138 in src/ui.rs

View check run for this annotation

Codecov / codecov/patch

src/ui.rs#L137-L138

Added lines #L137 - L138 were not covered by tests
Span::from(format!(" ({:.1} %)", *count as f32 / total as f32 * 100.0)),
];

Expand Down
Loading