Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,7 @@
# print(docs_parser.get_text("parser/assets/main.typ"))
# print(docs_parser.get_text("parser/assets/main.pdf"))
# print(docs_parser.get_text("parser/assets/too_many_png.docx"))
print(docs_parser.get_text("parser/assets/Presentation.pptx"))
# print(docs_parser.get_text("parser/assets/Presentation.pptx"))
docs_parser.convert_to_new_format("parser/assets/old_docs.doc", "parser/assets/tests_results")
docs_parser.convert_to_new_format("parser/assets/old_pres.ppt", "parser/assets/tests_results")
docs_parser.convert_to_new_format("parser/assets/old_exel.xls", "parser/assets/tests_results")
2 changes: 1 addition & 1 deletion parser/docs_parser.pyi
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Get text from file
def get_text(from_path: str) -> str: ...
# Convert a legacy MS Office file to the new format; `new_path` is the output directory
def convert_to_new_format(old_file_path: str, new_path: str) -> None: ...
8 changes: 7 additions & 1 deletion parser/src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ pub const APPLICATION_DOCX: &str =
/// MIME type for DOCX documents (the generic ZIP container type)
pub const APPLICATION_DOCX_ZIP: &str = "application/zip";

/// MIME type for DOC documents
pub const APPLICATION_DOC: &str = "application/msword";

/// MIME type for RTF documents
pub const APPLICATION_RTF: &str = "application/rtf";

/// MIME тип для XLSX (Microsoft Excel)
Expand All @@ -25,3 +28,6 @@ pub const APPLICATION_XLS: &str = "application/vnd.ms-excel";
/// MIME type for PPTX (Microsoft `PowerPoint`) presentations
pub const APPLICATION_PPTX: &str =
"application/vnd.openxmlformats-officedocument.presentationml.presentation";

/// MIME type for PPT (Microsoft `PowerPoint`) presentations
pub const APPLICATION_PPT: &str = "application/vnd.ms-powerpoint";
81 changes: 81 additions & 0 deletions parser/src/converter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
//! Модуль для перевода старых форматов документов Microsoft office в новые
//!
//! Используется cli утилита поставляемая libreoffice - soffice

use std::process::{Command, Stdio};

use crate::{
constants::{APPLICATION_DOC, APPLICATION_PPT, APPLICATION_RTF, APPLICATION_XLS},
errors::ParserError,
match_parsers::{define_mime_type, read_data_from_file},
};

/// Module-local result alias whose error type is [`ParserError`].
type Result<T> = std::result::Result<T, ParserError>;

/// Supported families of legacy Microsoft Office formats.
///
/// The type is a plain tag with no payload, so it derives `Copy` and the
/// comparison traits to stay cheap to pass by value and easy to assert on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MSOfficeFormat {
    /// doc-like formats
    Doc,
    /// xls-like formats
    Xls,
    /// ppt-like formats
    Ppt,
}

/// Converts a legacy Microsoft Office file into its modern counterpart.
///
/// Reads the file, detects its MIME type from the content and dispatches the
/// conversion for the matching format family.
///
/// # Arguments
/// - `old_file_path` - path to the legacy-format file
/// - `new_path` - path where the new-format file should appear
///
/// # Errors
/// - [`ParserError::InvalidFormat`] - the file type is unsupported or could not be detected
/// - [`ParserError::IoError`] - problems with libreoffice
pub(crate) fn convert_to_new_format(old_file_path: &str, new_path: &str) -> Result<()> {
    let bytes = read_data_from_file(old_file_path)?;

    // No detectable MIME type at all: report it before looking at formats.
    let detected = define_mime_type(&bytes).ok_or_else(|| {
        ParserError::InvalidFormat("Не получается определить данный тип файла ".to_string())
    })?;

    // Map the detected MIME type onto the legacy format family.
    let target = if detected == APPLICATION_RTF || detected == APPLICATION_DOC {
        MSOfficeFormat::Doc
    } else if detected == APPLICATION_XLS {
        MSOfficeFormat::Xls
    } else if detected == APPLICATION_PPT {
        MSOfficeFormat::Ppt
    } else {
        return Err(ParserError::InvalidFormat(format!(
            "Не поддерживается данный тип файла {detected}"
        )));
    };

    converter_files(target, old_file_path, new_path)
}

/// Converts a file into the modern counterpart of the given legacy format.
///
/// Runs the `soffice` CLI in headless mode with its output silenced and waits
/// for it to finish.
///
/// # Arguments
/// - `type_format` - legacy format family; selects the target extension
///   (`docx`, `xlsx` or `pptx`)
/// - `old_file_path` - path to the legacy-format file
/// - `new_path` - passed to `soffice` as `--outdir`, i.e. where the result is written
///
/// # Errors
/// - [`ParserError::IoError`] - `soffice` could not be started or exited unsuccessfully
fn converter_files(type_format: MSOfficeFormat, old_file_path: &str, new_path: &str) -> Result<()> {
    let type_convert = match type_format {
        MSOfficeFormat::Doc => "docx",
        MSOfficeFormat::Xls => "xlsx",
        MSOfficeFormat::Ppt => "pptx",
    };

    let status = Command::new("soffice")
        .arg("--headless")
        .arg("--convert-to")
        .arg(type_convert)
        .arg(old_file_path)
        .arg("--outdir")
        .arg(new_path)
        .stdout(Stdio::null())
        .stderr(Stdio::null())
        .status()?;

    // `status()` only fails when the process cannot be spawned; an unsuccessful
    // run is reported through the exit status, so it must be checked explicitly
    // or a failed conversion would be returned as success.
    if !status.success() {
        return Err(std::io::Error::other(format!(
            "soffice завершился с ошибкой ({status}) при конвертации файла {old_file_path} в {type_convert}"
        ))
        .into());
    }

    Ok(())
}
12 changes: 11 additions & 1 deletion parser/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod constants;
mod converter;
mod errors;
mod match_parsers;
mod parsers;
Expand All @@ -10,17 +11,26 @@ use pyo3::{PyResult, types::PyModule};
mod parser {
    use pyo3::prelude::*;

    /// Extracts the text of the file located at `from_path`.
    #[pyo3::pyfunction]
    pub fn get_text(from_path: &str) -> PyResult<String> {
        let text = crate::match_parsers::get_text(from_path)?;
        Ok(text)
    }

    /// Converts a legacy Microsoft Office file into the new format.
    #[pyo3::pyfunction]
    pub fn convert_to_new_format(old_file_path: &str, new_path: &str) -> PyResult<()> {
        crate::converter::convert_to_new_format(old_file_path, new_path)?;
        Ok(())
    }
}

/// Python module entry point: registers the exported functions on the module.
#[pymodule]
fn docs_parser(m: &Bound<'_, PyModule>) -> PyResult<()> {
    let get_text_fn = wrap_pyfunction!(parser::get_text, m)?;
    m.add_function(get_text_fn)?;

    let convert_fn = wrap_pyfunction!(parser::convert_to_new_format, m)?;
    m.add_function(convert_fn)
}
2 changes: 1 addition & 1 deletion parser/src/match_parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ pub(crate) fn define_mime_type(file_data: &[u8]) -> Option<Mime> {
}

/// Считывает данные из файла ввиде byte vec
fn read_data_from_file(file_name: &str) -> Result<Vec<u8>> {
pub(crate) fn read_data_from_file(file_name: &str) -> Result<Vec<u8>> {
Ok(std::fs::read(file_name)?)
}

Expand Down