Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import docs_parser

# NOTE: все эти точно работают и работают хорошо
# print(docs_parser.get_text("parser/assets/text_and_tables.docx"))
# print(docs_parser.get_text("parser/assets/some_text.docx"))
# print(docs_parser.get_text("parser/assets/text_tables_png.docx"))
# print(docs_parser.get_text("parser/assets/text_from_img.png"))
# print(docs_parser.get_text("parser/assets/main.typ"))
# print(docs_parser.get_text("parser/assets/main.pdf"))
# print(docs_parser.get_text("parser/assets/too_many_png.docx"))
# print(docs_parser.get_text("parser/assets/Presentation.pptx"))
docs_parser.convert_to_new_format("parser/assets/old_docs.doc", "parser/assets/tests_results")
docs_parser.convert_to_new_format("parser/assets/old_pres.ppt", "parser/assets/tests_results")
docs_parser.convert_to_new_format("parser/assets/old_exel.xls", "parser/assets/tests_results")
# (doc_p, _) = docs_parser.get_text("parser/assets/text_and_tables.docx")
# (doc_p, _) = docs_parser.get_text("parser/assets/text_and_tables.docx")
# (doc_p, _) = docs_parser.get_text("parser/assets/some_text.docx")
# (doc_p, _) = docs_parser.get_text("parser/assets/text_tables_png.docx")
# (doc_p, _) = docs_parser.get_text("parser/assets/text_from_img.png")
# (doc_p, _) = docs_parser.get_text("parser/assets/main.typ")
# (doc_p, _) = docs_parser.get_text("parser/assets/main.pdf")
# (doc_p, _) = docs_parser.get_text("parser/assets/too_many_png.docx")
# (doc_p, _) = docs_parser.get_text("parser/assets/Presentation.pptx")
# print(doc_p)
# docs_parser.convert_to_new_format("parser/assets/old_docs.doc", "parser/assets/tests_results")
# docs_parser.convert_to_new_format("parser/assets/old_pres.ppt", "parser/assets/tests_results")
# docs_parser.convert_to_new_format("parser/assets/old_exel.xls", "parser/assets/tests_results")
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@ A11 (в ней же таблица)A11_11 A11_12 A11_21 A11_22 a12 a13 a21 a23



МЯУ=191919

/************Image = 0************/
МЯУ=191919
/*************************************/

Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@

/*****************slide = 1 ***************/
/********************slide = 1********************/
Тема презентации
Подзаголовок презентации



/*****************slide = 2 ***************/
/********************slide = 2********************/
Текст заголовка со слайда
Абиба


/********slide = 2; img_num = 0********/
МЯУ=191919
/*****************************************************/
/**************************************************/
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@

/*****************slide = 1 ***************/
/********************slide = 1********************/
Тема презентации
Подзаголовок презентации



/*****************slide = 2 ***************/
/********************slide = 2********************/
Текст заголовка со слайда
Абиба

2 changes: 1 addition & 1 deletion parser/docs_parser.pyi
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
def get_text(from_path: str) -> str: ...
def get_text(from_path: str) -> tuple[str, dict[tuple[int, int], bytes]]: ...
def convert_to_new_format(old_file_path: str, new_path: str): ...
6 changes: 5 additions & 1 deletion parser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,15 @@ use pyo3::{PyResult, types::PyModule};

/// Модуль для реализации функций модуля `docs_parser`
mod parser {
use std::collections::HashMap;

use pyo3::prelude::*;
type ImgNumber = u32;
type ImagesInfo = HashMap<(u32, ImgNumber), Vec<u8>>;

/// Парсинг текста `from` файла по `path`
#[pyo3::pyfunction]
pub fn get_text(from_path: &str) -> PyResult<String> {
pub fn get_text(from_path: &str) -> PyResult<(String, ImagesInfo)> {
Ok(crate::match_parsers::get_text(from_path)?)
}

Expand Down
17 changes: 9 additions & 8 deletions parser/src/match_parsers.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! Функции определения MIME и выбора парсера в зависимости от MIME

use std::{str::from_utf8, sync::LazyLock};
use std::{collections::HashMap, str::from_utf8, sync::LazyLock};

use infer::Infer;
use mime::{IMAGE, Mime, TEXT, TEXT_PLAIN};
Expand All @@ -15,6 +15,8 @@ use crate::{
};

type Result<T> = std::result::Result<T, ParserError>;
type ImgNumber = u32;
type ImagesInfo = HashMap<(u32, ImgNumber), Vec<u8>>;

static INFER: LazyLock<Infer> = LazyLock::new(Infer::new);

Expand All @@ -30,25 +32,24 @@ static INFER: LazyLock<Infer> = LazyLock::new(Infer::new);
/// # Errors
/// - [`ParserError::InvalidFormat`] - тип файла не поддерживается/не определен
/// - Остальные варианты [`ParserError`], если ошибка во время парсинга файла
pub fn get_text(file_name: &str) -> Result<String> {
pub fn get_text(file_name: &str) -> Result<(String, ImagesInfo)> {
let file_data = read_data_from_file(file_name)?;
match define_mime_type(&file_data) {
Some(mime)
if mime == APPLICATION_DOCX
|| (mime == APPLICATION_DOCX_ZIP && file_name.ends_with(".docx")) =>
{
let mut docx_parser = docx::DocxParser::new();
let docx_parser = docx::DocxParser::new();
docx_parser.get_from_docx(&file_data)
}
Some(mime) if mime == APPLICATION_XLSX => todo!(),
Some(mime) if mime == APPLICATION_PPTX => {
let pptx_parser = pptx::PptxParser::new();
let (res, _) = pptx_parser.get_from_pptx(&file_data)?;
Ok(res)
pptx_parser.get_from_pptx(&file_data)
}
Some(mime) if mime == APPLICATION_PDF => get_from_pdf(&file_data),
Some(mime) if mime.type_() == TEXT => get_from_text(&file_data),
Some(mime) if mime.type_() == IMAGE => get_from_image(&file_data),
Some(mime) if mime == APPLICATION_PDF => Ok((get_from_pdf(&file_data)?, HashMap::new())),
Some(mime) if mime.type_() == TEXT => Ok((get_from_text(&file_data)?, HashMap::new())),
Some(mime) if mime.type_() == IMAGE => Ok((get_from_image(&file_data)?, HashMap::new())),
Some(mime) if is_converted_mime_type(&mime) => Err(ParserError::InvalidFormat(format!(
"Не поддерживается данный тип файла {mime}, но его вы можете конвертировать \
в поддерживаемый формат через отдельный метод конвертации"
Expand Down
92 changes: 58 additions & 34 deletions parser/src/parsers/docx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,25 @@ use zip::ZipArchive;
type Result<T> = std::result::Result<T, ParserError>;
type Id = String;
type Target = String;
type ImgNumber = u32;
type ImagesInfo = HashMap<(u32, ImgNumber), Vec<u8>>;

/// State used while extracting text and embedded images from a DOCX file.
pub(crate) struct DocxParser {
/// HashMap that stores image ids and the text extracted from each image.
pub images: HashMap<Id, String>,
/// Image bytes handed back to the caller by `get_from_docx`.
/// `pic_unwrap` inserts entries under the key `(0, image_number)`.
pub img_info: ImagesInfo,
/// Raw image bytes keyed by image id; filled in `extract_text_from_images`
/// and drained entry by entry in `pic_unwrap` as pictures are encountered.
temp_img_info: HashMap<Id, Vec<u8>>,
/// Number assigned to the next picture handled by `pic_unwrap`;
/// starts at 0 and is incremented after each inserted image.
cur_img_ind: ImgNumber,
}

// FIX: переделать под выдачу и текста, и картинки с метками в тексте (типа этот текст из картинки
// такой-то)
impl DocxParser {
/// Builds a fresh [`DocxParser`] with empty image maps and the image counter at zero.
pub(crate) fn new() -> Self {
    // Nothing has been parsed yet, so every lookup table starts out empty.
    let images = HashMap::new();
    let img_info = HashMap::new();
    let temp_img_info = HashMap::new();

    Self {
        images,
        img_info,
        temp_img_info,
        cur_img_ind: 0,
    }
}

Expand All @@ -50,33 +56,35 @@ impl DocxParser {
/// - [`ParserError::ZipError`] - ошибка во время парсинга docx как zip
/// - [`ParserError::XmlError`] - ошибка во время парсинга конфигурационного файла docx
/// - Остальные [`ParserError`] связанные с Tesseract ошибки во время парсинга картинки
pub(crate) fn get_from_docx(&mut self, data: &[u8]) -> Result<String> {
pub(crate) fn get_from_docx(mut self, data: &[u8]) -> Result<(String, ImagesInfo)> {
let dox = read_docx(data)?;
// Вытаскиваем все картинки
let images_bytes = self.extract_images_from_docx(data)?;
// Парсим текст из картинок
self.extract_text_from_images(images_bytes)?;

Ok(dox
.document
.children
.iter()
.filter_map(|from| match from {
docx_rs::DocumentChild::Paragraph(paragraph) => Some({
let mut paragraph_text = self.paragraph_unwrap(paragraph);
paragraph_text.push('\n');
paragraph_text
}),
docx_rs::DocumentChild::Table(table) => Some({
let mut table_text = self.table_unwrap(table);
table_text.push('\n');
table_text
}),
_ => None,
})
.collect::<Vec<String>>()
.join("\n")
.to_string())
Ok((
dox.document
.children
.iter()
.filter_map(|from| match from {
docx_rs::DocumentChild::Paragraph(paragraph) => Some({
let mut paragraph_text = self.paragraph_unwrap(paragraph);
paragraph_text.push('\n');
paragraph_text
}),
docx_rs::DocumentChild::Table(table) => Some({
let mut table_text = self.table_unwrap(table);
table_text.push('\n');
table_text
}),
_ => None,
})
.collect::<Vec<String>>()
.join("\n")
.to_string(),
self.img_info,
))
}

/// Проходится по всем парам
Expand All @@ -92,6 +100,7 @@ impl DocxParser {
/// - [`ParserError::ImageError`] - ошибка во время парсинга картинки
/// - Остальные [`ParserError`] связанные с Tesseract ошибки во время парсинга картинки
fn extract_text_from_images(&mut self, images: HashMap<Id, Vec<u8>>) -> Result<()> {
self.temp_img_info = images.clone();
self.images = images
.into_par_iter()
.map(|(id, data)| Ok((id, get_from_image(&data)?)))
Expand Down Expand Up @@ -186,7 +195,7 @@ impl DocxParser {
// *************************************************************************

/// Проходится по всем детям [`docx_rs::Paragraph`] и извлекает из них текст
fn paragraph_unwrap(&self, paragraph: &docx_rs::Paragraph) -> String {
fn paragraph_unwrap(&mut self, paragraph: &docx_rs::Paragraph) -> String {
paragraph
.children
.iter()
Expand All @@ -198,7 +207,7 @@ impl DocxParser {
}

/// Проходится по всем детям [`docx_rs::Run`] и извлекает из них текст
fn run_unwrap(&self, run: &docx_rs::Run) -> String {
fn run_unwrap(&mut self, run: &docx_rs::Run) -> String {
run.children
.iter()
.filter_map(|from| match from {
Expand All @@ -210,7 +219,7 @@ impl DocxParser {
}

/// Извлекает текст из [`docx_rs::Drawing`], если он есть
fn drawing_unwrap(&self, drawing: &docx_rs::Drawing) -> Result<Option<String>> {
fn drawing_unwrap(&mut self, drawing: &docx_rs::Drawing) -> Result<Option<String>> {
Ok(match &drawing.data {
Some(docx_rs::DrawingData::Pic(pic)) => Some(self.pic_unwrap(pic)?),
Some(docx_rs::DrawingData::TextBox(text_box)) => Some(self.text_box_unwrap(text_box)),
Expand All @@ -219,15 +228,30 @@ impl DocxParser {
}

/// Подставляет текст с нужной картинки вместо [`docx_rs::Pic`]
fn pic_unwrap(&self, pic: &docx_rs::Pic) -> Result<String> {
fn pic_unwrap(&mut self, pic: &docx_rs::Pic) -> Result<String> {
match self.images.get(&pic.id) {
Some(text) => Ok(text.clone()),
Some(text) => {
let data = self
.temp_img_info
.remove(&pic.id)
.expect("Байты картинки обязаны существовать в момент работы с картинкой");
let num = self.cur_img_ind;

self.img_info.insert((0, num), data);
self.cur_img_ind += 1;

Ok(format!(
"\n/************Image = {num}************/\n \
{text} \
\n/*************************************/\n",
))
}
None => Ok(String::new()),
}
}

/// Извлекает текст из [`docx_rs::TextBox`]
fn text_box_unwrap(&self, text_box: &docx_rs::TextBox) -> String {
fn text_box_unwrap(&mut self, text_box: &docx_rs::TextBox) -> String {
text_box
.children
.iter()
Expand All @@ -241,7 +265,7 @@ impl DocxParser {
}

/// Проходится по всем детям [`docx_rs::Table`] и извлекает из них текст
fn table_unwrap(&self, table: &docx_rs::Table) -> String {
fn table_unwrap(&mut self, table: &docx_rs::Table) -> String {
table
.rows
.iter()
Expand All @@ -252,7 +276,7 @@ impl DocxParser {
}

/// Извлекает текст из [`docx_rs::TableRow`]
fn table_row_unwrap(&self, table_row: &docx_rs::TableRow) -> String {
fn table_row_unwrap(&mut self, table_row: &docx_rs::TableRow) -> String {
table_row
.cells
.iter()
Expand All @@ -267,7 +291,7 @@ impl DocxParser {
}

/// Извлекает текст из [`docx_rs::TableCell`]
fn table_cell_unwrap(&self, cell: &docx_rs::TableCell) -> String {
fn table_cell_unwrap(&mut self, cell: &docx_rs::TableCell) -> String {
cell.children
.iter()
.filter_map(|from_cell_content| match from_cell_content {
Expand Down Expand Up @@ -339,8 +363,8 @@ mod tests {

fn extract_text_from_docx(extract_file: &str, check_file: &str) -> Result<()> {
let data = read_data_from_file(extract_file)?;
let mut pars = DocxParser::new();
let res = pars.get_from_docx(&data)?;
let pars = DocxParser::new();
let (res, _) = pars.get_from_docx(&data)?;

assert_eq!(
res.trim(),
Expand Down
4 changes: 2 additions & 2 deletions parser/src/parsers/pptx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ impl PptxParser {
}

self.slides_text.push(format!(
"\n/*****************slide = {} ***************/\n {}\n",
"\n/********************slide = {}********************/\n {}\n",
slide.index,
slide
.text_elements
Expand Down Expand Up @@ -110,7 +110,7 @@ impl PptxParser {

res_slide_text.push_str(&get_from_image(data)?);
res_slide_text
.push_str("\n/*****************************************************/");
.push_str("\n/**************************************************/\n");
}
Ok(res_slide_text)
})
Expand Down
Loading