diff --git a/app/main.py b/app/main.py index 5639639..e04abfa 100644 --- a/app/main.py +++ b/app/main.py @@ -7,4 +7,5 @@ # print(docs_parser.get_text("parser/assets/text_from_img.png")) # print(docs_parser.get_text("parser/assets/main.typ")) # print(docs_parser.get_text("parser/assets/main.pdf")) -print(docs_parser.get_text("parser/assets/too_many_png.docx")) +# print(docs_parser.get_text("parser/assets/too_many_png.docx")) +print(docs_parser.get_text("parser/assets/Presentation.pptx")) diff --git a/parser/Cargo.lock b/parser/Cargo.lock index a520a46..7509547 100644 --- a/parser/Cargo.lock +++ b/parser/Cargo.lock @@ -55,6 +55,56 @@ dependencies = [ "equator", ] +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + 
[[package]] name = "anyhow" version = "1.0.102" @@ -66,6 +116,9 @@ name = "arbitrary" version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] [[package]] name = "arg_enum_proc_macro" @@ -251,6 +304,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + [[package]] name = "bzip2" version = "0.6.1" @@ -260,6 +322,16 @@ dependencies = [ "libbz2-rs-sys", ] +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cbc" version = "0.1.2" @@ -334,12 +406,64 @@ dependencies = [ "libloading", ] +[[package]] +name = "clap" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.116", +] 
+ +[[package]] +name = "clap_lex" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" + [[package]] name = "color_quant" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "constant_time_eq" version = "0.4.2" @@ -364,6 +488,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.5.0" @@ -429,6 +568,17 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.116", +] + [[package]] name = "digest" version = "0.10.7" @@ -440,6 +590,17 @@ dependencies = [ "subtle", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.116", +] + [[package]] name = "docx-rs" version = "0.4.19" @@ -479,6 +640,29 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "env_filter" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "jiff", + "log", +] + [[package]] name = "equator" version = "0.4.2" @@ -614,9 +798,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasip2", + "wasm-bindgen", ] [[package]] @@ -788,6 +974,12 @@ dependencies = [ "syn 2.0.116", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.14.0" @@ -803,6 +995,30 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "jiff" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", +] + +[[package]] +name = "jiff-static" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.116", +] + [[package]] name = "jobserver" version = "0.1.34" @@ -950,6 +1166,16 @@ dependencies = [ "weezl", ] +[[package]] +name = "lzma-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +dependencies = [ + "byteorder", + "crc", +] + [[package]] name = "lzma-rust2" version = "0.16.2" @@ -959,6 +1185,17 @@ dependencies = [ "sha2", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "maybe-rayon" version = "0.1.1" @@ -1121,6 +1358,12 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "parser" version = "0.1.0" @@ -1131,8 +1374,9 @@ dependencies = [ "mime", "pdf-extract", "pyo3", - "quick-xml", + "quick-xml 0.39.2", "rayon", + "rustypptx", "tesseract", "thiserror 2.0.18", "zip 8.1.0", @@ -1214,6 +1458,15 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +[[package]] +name = "portable-atomic-util" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" +dependencies = [ + "portable-atomic", +] + [[package]] name = "postscript" version = "0.14.1" 
@@ -1361,6 +1614,15 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", +] + [[package]] name = "quick-xml" version = "0.39.2" @@ -1550,6 +1812,23 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "rustypptx" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f1265abd01a89e4f51fb62d3db2be3eb31004023b3957e3a482cfbc26d23509" +dependencies = [ + "clap", + "env_logger", + "log", + "quick-xml 0.37.5", + "rayon", + "serde", + "serde_json", + "thiserror 2.0.18", + "zip 2.4.2", +] + [[package]] name = "semver" version = "1.0.27" @@ -1665,6 +1944,12 @@ dependencies = [ "unicode-properties", ] +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "subtle" version = "2.6.1" @@ -1882,6 +2167,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.21.0" @@ -2212,6 +2503,15 @@ version = "0.8.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f" +[[package]] +name = "xz2" 
+version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "y4m" version = "0.8.0" @@ -2270,6 +2570,36 @@ dependencies = [ "flate2", ] +[[package]] +name = "zip" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +dependencies = [ + "aes", + "arbitrary", + "bzip2 0.5.2", + "constant_time_eq 0.3.1", + "crc32fast", + "crossbeam-utils", + "deflate64", + "displaydoc", + "flate2", + "getrandom 0.3.4", + "hmac", + "indexmap", + "lzma-rs", + "memchr", + "pbkdf2", + "sha1", + "thiserror 2.0.18", + "time", + "xz2", + "zeroize", + "zopfli", + "zstd", +] + [[package]] name = "zip" version = "8.1.0" @@ -2277,8 +2607,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e499faf5c6b97a0d086f4a8733de6d47aee2252b8127962439d8d4311a73f72" dependencies = [ "aes", - "bzip2", - "constant_time_eq", + "bzip2 0.6.1", + "constant_time_eq 0.4.2", "crc32fast", "deflate64", "flate2", diff --git a/parser/Cargo.toml b/parser/Cargo.toml index 0c35600..6569198 100644 --- a/parser/Cargo.toml +++ b/parser/Cargo.toml @@ -20,6 +20,7 @@ mime = "0.3.17" # NOTE: Для парсинга форматов офиса docx-rs = "0.4.19" +rustypptx = "0.2.0" zip = "8.1.0" quick-xml = "0.39.2" diff --git a/parser/assets/pres_with_png.pptx b/parser/assets/pres_with_png.pptx new file mode 100644 index 0000000..84e9949 Binary files /dev/null and b/parser/assets/pres_with_png.pptx differ diff --git a/parser/assets/pres_without_png.pptx b/parser/assets/pres_without_png.pptx new file mode 100644 index 0000000..923a008 Binary files /dev/null and b/parser/assets/pres_without_png.pptx differ diff --git a/parser/assets/tests_results/extract_text_from_pptx_with_png.txt b/parser/assets/tests_results/extract_text_from_pptx_with_png.txt new file mode 
100644 index 0000000..233fd74 --- /dev/null +++ b/parser/assets/tests_results/extract_text_from_pptx_with_png.txt @@ -0,0 +1,15 @@ + +/*****************slide = 1 ***************/ + Тема презентации +Подзаголовок презентации + + + +/*****************slide = 2 ***************/ + Текст заголовка со слайда +Абиба + + +/********slide = 2; img_num = 0********/ +МЯУ=191919 +/*****************************************************/ \ No newline at end of file diff --git a/parser/assets/tests_results/extract_text_from_pptx_without_png.txt b/parser/assets/tests_results/extract_text_from_pptx_without_png.txt new file mode 100644 index 0000000..397c389 --- /dev/null +++ b/parser/assets/tests_results/extract_text_from_pptx_without_png.txt @@ -0,0 +1,11 @@ + +/*****************slide = 1 ***************/ + Тема презентации +Подзаголовок презентации + + + +/*****************slide = 2 ***************/ + Текст заголовка со слайда +Абиба + diff --git a/parser/src/errors.rs b/parser/src/errors.rs index f83b5b6..bb0a2e9 100644 --- a/parser/src/errors.rs +++ b/parser/src/errors.rs @@ -55,6 +55,12 @@ pub enum ParserError { #[error("Docx error: {0}")] DocxError(#[from] docx_rs::ReaderError), + /// Pptx read error + /// + /// Errors raised by the library used to work with pptx files + #[error("Pptx error: {0}")] + PptxError(#[from] rustypptx::PptxError), + /// Ошибка tesseract::InitializeError #[error("Tesseract init error: {0}")] TesseractInitError(#[from] tesseract::InitializeError), diff --git a/parser/src/lib.rs b/parser/src/lib.rs index bcf1b0f..6bedfa3 100644 --- a/parser/src/lib.rs +++ b/parser/src/lib.rs @@ -6,7 +6,6 @@ mod parsers; use pyo3::prelude::*; use pyo3::{PyResult, types::PyModule}; - /// Модуль для реализации функций модуля `docs_parser` mod parser { use pyo3::prelude::*; diff --git a/parser/src/match_parsers.rs b/parser/src/match_parsers.rs index 9a48a25..230238a 100644 --- a/parser/src/match_parsers.rs +++ b/parser/src/match_parsers.rs @@ -11,7 +11,7 @@ use crate::{ APPLICATION_XLS, 
APPLICATION_XLSX, }, errors::ParserError, - parsers::{docx, image::get_from_image, pdf::get_from_pdf, text::get_from_text}, + parsers::{docx, image::get_from_image, pdf::get_from_pdf, pptx, text::get_from_text}, }; type Result = std::result::Result; @@ -41,7 +41,11 @@ pub fn get_text(file_name: &str) -> Result { docx_parser.get_from_docx(&file_data) } Some(mime) if mime == APPLICATION_XLSX => todo!(), - Some(mime) if mime == APPLICATION_PPTX => todo!(), + Some(mime) if mime == APPLICATION_PPTX => { + let pptx_parser = pptx::PptxParser::new(); + let (res, _) = pptx_parser.get_from_pptx(&file_data)?; + Ok(res) + } Some(mime) if mime == APPLICATION_PDF => get_from_pdf(&file_data), Some(mime) if mime.type_() == TEXT => get_from_text(&file_data), Some(mime) if mime.type_() == IMAGE => get_from_image(&file_data), diff --git a/parser/src/parsers/docx.rs b/parser/src/parsers/docx.rs index 43a7c94..5101f7b 100644 --- a/parser/src/parsers/docx.rs +++ b/parser/src/parsers/docx.rs @@ -25,8 +25,10 @@ pub(crate) struct DocxParser { pub images: HashMap, } +// FIX: rework to return both the text and the images, with markers in the text (e.g. "this text +// comes from image such-and-such") impl DocxParser { - /// Creates a new [`DocxParser`]. + /// Создает новый [`DocxParser`]. 
pub(crate) fn new() -> Self { Self { images: HashMap::new(), @@ -40,7 +42,14 @@ impl DocxParser { /// /// # Returns /// - Ok([`String`]) - возвращает текст - /// - Err([`ParserError::DocxError`]) - ошибка во время парсинга docx файла + /// - Err([`ParserError`]) - ошибка во время парсинга docx файла + /// + /// # Errors + /// - [`ParserError::DocxError`] - ошибка во время docx + /// - [`ParserError::ImageError`] - ошибка во время парсинга картинки + /// - [`ParserError::ZipError`] - ошибка во время парсинга docx как zip + /// - [`ParserError::XmlError`] - ошибка во время парсинга конфигурационного файла docx + /// - Остальные [`ParserError`] связанные с Tesseract ошибки во время парсинга картинки pub(crate) fn get_from_docx(&mut self, data: &[u8]) -> Result { let dox = read_docx(data)?; // Вытаскиваем все картинки @@ -172,11 +181,11 @@ impl DocxParser { Ok(images_with_id) } - // ***************************************************************************** + // ************************************************************************* // Работа с элементами docx - // ***************************************************************************** + // ************************************************************************* - /// Проходится по всем детям `Paragraph` и извлекает из них текст + /// Проходится по всем детям [`docx_rs::Paragraph`] и извлекает из них текст fn paragraph_unwrap(&self, paragraph: &docx_rs::Paragraph) -> String { paragraph .children @@ -188,7 +197,7 @@ impl DocxParser { .collect::() } - /// Проходится по всем детям `Run` и извлекает из них текст + /// Проходится по всем детям [`docx_rs::Run`] и извлекает из них текст fn run_unwrap(&self, run: &docx_rs::Run) -> String { run.children .iter() @@ -200,7 +209,7 @@ impl DocxParser { .collect::() } - /// Извлекает текст из `Drawing`, если он есть + /// Извлекает текст из [`docx_rs::Drawing`], если он есть fn drawing_unwrap(&self, drawing: &docx_rs::Drawing) -> Result> { Ok(match &drawing.data { 
Some(docx_rs::DrawingData::Pic(pic)) => Some(self.pic_unwrap(pic)?), @@ -209,6 +218,7 @@ impl DocxParser { }) } + /// Подставляет текст с нужной картинки вместо [`docx_rs::Pic`] fn pic_unwrap(&self, pic: &docx_rs::Pic) -> Result { match self.images.get(&pic.id) { Some(text) => Ok(text.clone()), @@ -216,7 +226,7 @@ impl DocxParser { } } - /// Извлекает текст из `TextBox` + /// Извлекает текст из [`docx_rs::TextBox`] fn text_box_unwrap(&self, text_box: &docx_rs::TextBox) -> String { text_box .children @@ -230,7 +240,7 @@ impl DocxParser { .collect::() } - /// Проходится по всем детям `Table` и извлекает из них текст + /// Проходится по всем детям [`docx_rs::Table`] и извлекает из них текст fn table_unwrap(&self, table: &docx_rs::Table) -> String { table .rows @@ -241,7 +251,7 @@ impl DocxParser { .collect::() } - /// Извлекает текст из `TableRow` + /// Извлекает текст из [`docx_rs::TableRow`] fn table_row_unwrap(&self, table_row: &docx_rs::TableRow) -> String { table_row .cells @@ -256,7 +266,7 @@ impl DocxParser { .collect::() } - /// Извлекает текст из `TableCell` + /// Извлекает текст из [`docx_rs::TableCell`] fn table_cell_unwrap(&self, cell: &docx_rs::TableCell) -> String { cell.children .iter() diff --git a/parser/src/parsers/mod.rs b/parser/src/parsers/mod.rs index e4c0d96..af76190 100644 --- a/parser/src/parsers/mod.rs +++ b/parser/src/parsers/mod.rs @@ -1,6 +1,8 @@ //! Модуль для реализации парсеров -mod xml; pub(crate) mod docx; pub(crate) mod image; -pub(crate) mod text; pub(crate) mod pdf; +pub(crate) mod pptx; +pub(crate) mod text; + +mod xml; diff --git a/parser/src/parsers/pptx.rs b/parser/src/parsers/pptx.rs new file mode 100644 index 0000000..c7b7821 --- /dev/null +++ b/parser/src/parsers/pptx.rs @@ -0,0 +1,160 @@ +//! Парсинг pptx файлов +//! +//! 
Для парсинга используется crate rustypptx + +use std::collections::HashMap; + +use rayon::prelude::*; + +use crate::{errors::ParserError, parsers::image::get_from_image}; + +type Result<T> = std::result::Result<T, ParserError>; +type SlideIndex = u32; +type ImgOnSlideNum = u32; +type ImagesInfo = HashMap<(SlideIndex, ImgOnSlideNum), Vec<u8>>; + +pub(crate) struct PptxParser { + /// Maps (slide index, image number on that slide) to the raw bytes of the image + pub slides_img_info: ImagesInfo, + /// Text of each slide (a slide's index is 1 greater than its position in `slides_text`) + pub slides_text: Vec<String>, +} + +impl PptxParser { + /// Creates a new, empty [`PptxParser`]. + pub(crate) fn new() -> Self { + Self { + slides_img_info: HashMap::new(), + slides_text: Vec::new(), + } + } + + /// Extracts the slide text and the text found in the slide images + /// + /// # Arguments + /// - `mut `[`self`] - the parser itself (the call takes ownership of it) + /// - `data` - byte slice with the contents of the file + /// + /// # Returns + /// - Ok(([`String`], `ImagesInfo`)) - the document title (when the metadata has one) followed + /// by the text of all slides, together with the collected image bytes + /// - Err([`ParserError`]) - error while parsing the pptx file + /// + /// # Errors + /// - [`ParserError::PptxError`] - error while parsing the pptx + /// - [`ParserError::ImageError`] - error while parsing an image + /// - Other Tesseract-related [`ParserError`] variants raised while parsing an image + pub(crate) fn get_from_pptx(mut self, data: &[u8]) -> Result<(String, ImagesInfo)> { + let pptx_doc = rustypptx::parse_pptx_bytes(data)?; + let mut result_text = String::new(); + + if let Some(title) = &pptx_doc.metadata.title { + result_text.push_str(&format!("Название: {title}")); + } + + self.set_slides_text_and_img_info(pptx_doc); + // Append instead of assigning: a plain assignment here discarded the title pushed above + result_text.push_str(&self.add_text_from_img_in_slides()?); + + Ok((result_text, self.slides_img_info)) + } + + /// Fills the parser with data from the pptx document for further processing + /// (text and images of the slides) + fn set_slides_text_and_img_info(&mut self, pptx_doc: rustypptx::PptxDocument) { + for slide in pptx_doc.slides.iter() { + for (ind, 
img) in slide.images.iter().enumerate() { + self.slides_img_info + .insert((slide.index, ind as u32), img.data.clone()); + } + + self.slides_text.push(format!( + "\n/*****************slide = {} ***************/\n {}\n", + slide.index, + slide + .text_elements + .iter() + .fold(String::new(), |mut sl_text, text_element| { + sl_text.push_str(&text_element.text); + sl_text.push('\n'); + sl_text + }) + )); + } + } + + /// Joins the text of all slides into a single string, extracting the text of every + /// slide's images with `get_from_image` and appending it after that slide's text + /// + /// # Returns + /// - Ok([`String`]) - the text of all slides + /// - Err([`ParserError`]) - error while parsing the pptx file + /// + /// # Errors + /// - [`ParserError::ImageError`] - error while parsing an image + /// - Other Tesseract-related [`ParserError`] variants raised while parsing an image + fn add_text_from_img_in_slides(&mut self) -> Result<String> { + Ok(self + .slides_text + .par_iter() + .enumerate() + .map(|(sl_ind, text)| { + let mut res_slide_text = String::from(text); + + // HashMap iteration order is unspecified: collect this slide's images and + // sort them by image number so the output order is stable between runs + let mut slide_images: Vec<_> = self + .slides_img_info + .iter() + .filter(|((ind, _), _)| *ind as usize == sl_ind + 1) + .collect(); + slide_images.sort_unstable_by_key(|((_, img_num), _)| *img_num); + + for ((ind, img_num), data) in slide_images { + res_slide_text.push_str(&format!( + "\n/********slide = {ind}; img_num = {img_num}********/\n" + )); + + res_slide_text.push_str(&get_from_image(data)?); + res_slide_text + .push_str("\n/*****************************************************/"); + } + Ok(res_slide_text) + }) + .collect::<Result<Vec<String>>>()? + .join("\n")) + } +} + +#[cfg(test)] +mod tests { + use crate::{errors::ParserError, parsers::pptx::PptxParser}; + + type Result<T> = std::result::Result<T, ParserError>; + + /// Reads the contents of a file as a byte vec + fn read_data_from_file(file_name: &str) -> Result<Vec<u8>> { + Ok(std::fs::read(file_name)?) 
+ } + + fn extract_text_from_pptx(extract_file: &str, check_file: &str) -> Result<()> { + let data = read_data_from_file(extract_file)?; + let pars = PptxParser::new(); + let (res, _) = pars.get_from_pptx(&data)?; + + assert_eq!( + res.trim(), + String::from_utf8(read_data_from_file(check_file)?)?.trim() + ); + Ok(()) + } + + #[test] + fn extract_text_from_pptx_without_png() -> Result<()> { + extract_text_from_pptx( + "assets/pres_without_png.pptx", + "assets/tests_results/extract_text_from_pptx_without_png.txt", + ) + } + + #[test] + fn extract_text_from_pptx_with_png() -> Result<()> { + extract_text_from_pptx( + "assets/pres_with_png.pptx", + "assets/tests_results/extract_text_from_pptx_with_png.txt", + ) + } +} diff --git a/parser/tests/test.rs b/parser/tests/test.rs index e69de29..8b13789 100644 --- a/parser/tests/test.rs +++ b/parser/tests/test.rs @@ -0,0 +1 @@ +