diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py index 7160b0f7..4c18daee 100644 --- a/dedoc/readers/docx_reader/data_structures/docx_document.py +++ b/dedoc/readers/docx_reader/data_structures/docx_document.py @@ -12,8 +12,8 @@ from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.docx_reader.data_structures.table import DocxTable from dedoc.readers.docx_reader.data_structures.utils import Counter, ParagraphMaker -from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter +from dedoc.readers.docx_reader.note_extractor import NoteExtractor from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor from dedoc.readers.docx_reader.styles_extractor import StylesExtractor from dedoc.utils.office_utils import get_bs_from_zip @@ -47,8 +47,9 @@ def __get_paragraph_maker(self) -> ParagraphMaker: path_hash=calculate_file_hash(path=self.path), styles_extractor=styles_extractor, numbering_extractor=numbering_extractor, - footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")), - endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote") + footnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")), + endnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote"), + comment_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/comments.xml"), key="comment") ) def __get_lines(self) -> List[LineWithMeta]: diff --git a/dedoc/readers/docx_reader/data_structures/paragraph.py b/dedoc/readers/docx_reader/data_structures/paragraph.py index 862dba1d..16cd40ed 100644 --- a/dedoc/readers/docx_reader/data_structures/paragraph.py +++ b/dedoc/readers/docx_reader/data_structures/paragraph.py @@ -4,7 +4,7 @@ from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties from dedoc.readers.docx_reader.data_structures.run import Run -from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor +from dedoc.readers.docx_reader.note_extractor import NoteExtractor from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor from dedoc.readers.docx_reader.properties_extractor import change_paragraph_properties, change_run_properties from dedoc.readers.docx_reader.styles_extractor import StyleType, StylesExtractor @@ -16,8 +16,9 @@ def __init__(self, xml: Tag, styles_extractor: StylesExtractor, numbering_extractor: NumberingExtractor, - footnote_extractor: FootnoteExtractor, - endnote_extractor: FootnoteExtractor, + footnote_extractor: NoteExtractor, + endnote_extractor: NoteExtractor, + comment_extractor: NoteExtractor, uid: str) -> None: """ Contains information about paragraph properties. @@ -30,9 +31,10 @@ def __init__(self, self.xml = xml self.footnote_extractor = footnote_extractor self.endnote_extractor = endnote_extractor + self.comment_extractor = comment_extractor self.numbering_extractor = numbering_extractor self.styles_extractor = styles_extractor - self.footnotes = [] + self.notes = [] self.runs = [] self.runs_ids = [] # list of (start, end) inside the paragraph text self.text = "" @@ -85,12 +87,8 @@ def __parse(self) -> None: if hasattr(self, "caps") and self.caps: self.text = self.text.upper() - for key, extractor in [("w:footnoteReference", self.footnote_extractor), ("w:endnoteReference", self.endnote_extractor)]: - notes = self.xml.find_all(key) - for footnote in notes: - note_id = footnote.get("w:id") - if note_id in extractor.id2footnote: - self.footnotes.append(extractor.id2footnote[note_id]) + for extractor in [self.footnote_extractor, self.endnote_extractor]: + self.notes.extend(extractor.get_notes(self.xml)) def __get_numbering_formatting(self) -> Optional[Run]: """ @@ -99,7 +97,7 @@ def __get_numbering_formatting(self) -> Optional[Run]: :returns: numbering run if there is the text in numbering else None """ if self.xml.numPr and self.numbering_extractor: - numbering_run = Run(self, self.styles_extractor) + numbering_run = Run(self, self.styles_extractor, self.comment_extractor) self.numbering_extractor.parse(self.xml.numPr, self, numbering_run) if numbering_run.text: @@ -115,7 +113,7 @@ def __make_run_list(self) -> None: run_list = self.xml.find_all("w:r") for run_tree in run_list: - new_run = Run(self, self.styles_extractor) + new_run = Run(self, self.styles_extractor, self.comment_extractor) if run_tree.rStyle: self.styles_extractor.parse(run_tree.rStyle["w:val"], new_run, StyleType.CHARACTER) @@ -126,6 +124,9 @@ def __make_run_list(self) -> None: change_run_properties(new_run, run_tree.rPr) new_run.get_text(run_tree) if not new_run.text: + if new_run.linked_text and self.runs: + prev_linked_text = self.runs[-1].linked_text + self.runs[-1].linked_text = new_run.linked_text if not prev_linked_text else f"{prev_linked_text}; {new_run.linked_text}" continue if self.runs and self.runs[-1] == new_run: diff --git a/dedoc/readers/docx_reader/data_structures/run.py b/dedoc/readers/docx_reader/data_structures/run.py index 138ef6c0..0f4bca98 100644 --- a/dedoc/readers/docx_reader/data_structures/run.py +++ b/dedoc/readers/docx_reader/data_structures/run.py @@ -3,21 +3,25 @@ from bs4 import Tag from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties +from dedoc.readers.docx_reader.note_extractor import NoteExtractor from dedoc.readers.docx_reader.properties_extractor import change_caps class Run(BaseProperties): - def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor") -> None: # noqa + def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor", comment_extractor: Optional[NoteExtractor] = None) -> None: # noqa """ Contains information about run properties. :param properties: Paragraph or Run for copying its properties :param styles_extractor: StylesExtractor + :param comment_extractor: NoteExtractor for comments """ self.name2char = dict(tab="\t", br="\n", cr="\r") self.text = "" + self.linked_text = "" self.styles_extractor = styles_extractor + self.comment_extractor = comment_extractor super().__init__(properties) def get_text(self, xml: Tag) -> None: @@ -25,6 +29,10 @@ def get_text(self, xml: Tag) -> None: Makes the text of run. :param xml: BeautifulSoup tree with run properties """ + notes = self.comment_extractor.get_notes(xml) if self.comment_extractor else None + if notes: + self.linked_text = "; ".join(notes) + for tag in xml: tag_name = tag.name @@ -56,4 +64,5 @@ def __eq__(self, other: "Run") -> bool: size_eq = self.size == other.size font_eq = self.bold == other.bold and self.italic == other.italic and self.underlined == other.underlined script_eq = self.superscript == other.superscript and self.subscript == other.subscript - return size_eq and font_eq and script_eq + linked_text_eq = self.linked_text == other.linked_text + return size_eq and font_eq and script_eq and linked_text_eq diff --git a/dedoc/readers/docx_reader/data_structures/utils.py b/dedoc/readers/docx_reader/data_structures/utils.py index 0eb54158..05b404fc 100644 --- a/dedoc/readers/docx_reader/data_structures/utils.py +++ b/dedoc/readers/docx_reader/data_structures/utils.py @@ -6,7 +6,7 @@ from bs4 import Tag from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph -from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor +from dedoc.readers.docx_reader.note_extractor import NoteExtractor from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor from dedoc.readers.docx_reader.styles_extractor import StylesExtractor @@ -35,14 +35,16 @@ def __init__(self, counter: Counter, styles_extractor: StylesExtractor, numbering_extractor: NumberingExtractor, - footnote_extractor: FootnoteExtractor, - endnote_extractor: FootnoteExtractor) -> None: + footnote_extractor: NoteExtractor, + endnote_extractor: NoteExtractor, + comment_extractor: NoteExtractor) -> None: self.counter = counter self.path_hash = path_hash self.styles_extractor = styles_extractor self.numbering_extractor = numbering_extractor self.footnote_extractor = footnote_extractor self.endnote_extractor = endnote_extractor + self.comment_extractor = comment_extractor self.uids_set = set() def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) -> Paragraph: @@ -52,6 +54,7 @@ def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) -> numbering_extractor=self.numbering_extractor, footnote_extractor=self.footnote_extractor, endnote_extractor=self.endnote_extractor, + comment_extractor=self.comment_extractor, uid=uid) prev_paragraph = None if len(paragraph_list) == 0 else paragraph_list[-1] paragraph.spacing = paragraph.spacing_before if prev_paragraph is None else max(prev_paragraph.spacing_after, paragraph.spacing_before) diff --git a/dedoc/readers/docx_reader/footnote_extractor.py b/dedoc/readers/docx_reader/footnote_extractor.py deleted file mode 100644 index 1eb6732e..00000000 --- a/dedoc/readers/docx_reader/footnote_extractor.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import Optional - -from bs4 import BeautifulSoup - - -class FootnoteExtractor: - - def __init__(self, xml: Optional[BeautifulSoup], key: str = "footnote") -> None: - """ - :param xml: BeautifulSoup tree with styles - :param key: footnote or endnote - """ - self.id2footnote = {} - if not xml: - return - - for footnote in xml.find_all(f"w:{key}"): - footnote_id = footnote.get("w:id") - footnote_text = " ".join(t.text for t in footnote.find_all("w:t") if t.text) - if footnote_id and footnote_text: - self.id2footnote[footnote_id] = footnote_text diff --git a/dedoc/readers/docx_reader/line_with_meta_converter.py b/dedoc/readers/docx_reader/line_with_meta_converter.py index ead068d7..ad76e670 100644 --- a/dedoc/readers/docx_reader/line_with_meta_converter.py +++ b/dedoc/readers/docx_reader/line_with_meta_converter.py @@ -24,7 +24,9 @@ def __init__(self, paragraph: Paragraph, paragraph_id: int) -> None: Converts custom DOCX Paragraph to LineWithMeta class. :param paragraph: Paragraph for converting its properties to the unified representation. """ - annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation] + annotations = [ + BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation, LinkedTextAnnotation + ] self.dict2annotation = {annotation.name: annotation for annotation in annotations} self.annotation_merger = AnnotationMerger() @@ -37,8 +39,8 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta: AlignmentAnnotation(start=0, end=len(paragraph.text), value=paragraph.jc), SpacingAnnotation(start=0, end=len(paragraph.text), value=str(paragraph.spacing)) ] - for footnote in paragraph.footnotes: - annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=footnote)) + for note in paragraph.notes: + annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=note)) if paragraph.style_name is not None: annotations.append(StyleAnnotation(start=0, end=len(paragraph.text), value=paragraph.style_name)) @@ -47,7 +49,7 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta: for run, (start, end) in zip(paragraph.runs, paragraph.runs_ids): annotations.append(SizeAnnotation(start=start, end=end, value=str(run.size / 2))) - for property_name in ["bold", "italic", "underlined", "strike", "superscript", "subscript"]: + for property_name in self.dict2annotation: property_value = getattr(run, property_name) if property_value: annotations.append(self.dict2annotation[property_name](start=start, end=end, value=str(property_value))) diff --git a/dedoc/readers/docx_reader/note_extractor.py b/dedoc/readers/docx_reader/note_extractor.py new file mode 100644 index 00000000..c319d186 --- /dev/null +++ b/dedoc/readers/docx_reader/note_extractor.py @@ -0,0 +1,33 @@ +from typing import Dict, List, Optional + +from bs4 import BeautifulSoup, Tag + + +class NoteExtractor: + + def __init__(self, xml: Optional[BeautifulSoup], key: str = "footnote") -> None: + """ + :param xml: BeautifulSoup tree with styles + :param key: footnote, endnote or comment + """ + self.key = key + self.id2note: Dict[str, str] = {} + if not xml: + return + + for note in xml.find_all(f"w:{key}"): + note_id = note.get("w:id") + note_text = " ".join(t.text for t in note.find_all("w:t") if t.text) + author = note.get("w:author") + note_text = f"{author}: {note_text}" if author else note_text + if note_id and note_text: + self.id2note[note_id] = note_text + + def get_notes(self, xml: Tag) -> List[str]: + notes_xml = xml.find_all(f"w:{self.key}Reference") + notes = [] + for note in notes_xml: + note_id = note.get("w:id") + if note_id in self.id2note: + notes.append(self.id2note[note_id]) + return notes diff --git a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py index a68a92e5..b9846975 100644 --- a/dedoc/readers/excel_reader/excel_reader.py +++ b/dedoc/readers/excel_reader/excel_reader.py @@ -2,6 +2,7 @@ from xlrd.sheet import Sheet +from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation from dedoc.data_structures.table import Table from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.readers.base_reader import BaseReader @@ -54,8 +55,13 @@ def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table: for row_id in range(n_rows): row = [] for col_id in range(n_cols): - value = str(sheet.cell_value(rowx=row_id, colx=col_id)) - row.append(CellWithMeta(lines=[LineWithMeta(line=value, metadata=LineMetadata(page_id=sheet_id, line_id=0))])) + cell_text = str(sheet.cell_value(rowx=row_id, colx=col_id)) + if (row_id, col_id) in sheet.cell_note_map: + note_text = sheet.cell_note_map[(row_id, col_id)].text.replace("\n", " ") + annotations = [LinkedTextAnnotation(start=0, end=len(cell_text), value=note_text)] + else: + annotations = [] + row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=sheet_id, line_id=0), annotations=annotations)])) res.append(row) metadata = TableMetadata(page_id=sheet_id) return Table(cells=res, metadata=metadata) diff --git a/docker-compose.yml b/docker-compose.yml index 93116e97..0460a3bb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -29,6 +29,6 @@ services: is_test: $test grobid: - image: "lfoppiano/grobid:0.8.0" + image: "grobid/grobid:0.8.2" ports: - 8070:8070 diff --git a/tests/api_tests/test_api_doctype_article.py b/tests/api_tests/test_api_doctype_article.py index bef10773..f19642ff 100644 --- a/tests/api_tests/test_api_doctype_article.py +++ b/tests/api_tests/test_api_doctype_article.py @@ -40,12 +40,12 @@ def test_article(self) -> None: # check bibliography list self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.12")["metadata"]["paragraph_type"]) - self.assertEqual(65, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"])) + self.assertEqual(64, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"])) # check bib_item 1 recognizing self.assertEqual("title", self._get_by_tree_path(tree, "0.12.0.0")["metadata"]["paragraph_type"]) self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.12.0.0")["text"]) - self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"]) + self.assertEqual("title_journal", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"]) self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.12.0.1")["text"]) self.assertEqual("author", self._get_by_tree_path(tree, "0.12.0.2")["metadata"]["paragraph_type"]) # author 1 self.assertEqual("Michel Abdalla", self._get_by_tree_path(tree, "0.12.0.2")["text"]) @@ -55,12 +55,12 @@ def test_article(self) -> None: self.assertEqual("471-488", self._get_by_tree_path(tree, "0.12.0.6")["text"]) # check cite on bib_item - bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"] # checking on [58] references + bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"] section = self._get_by_tree_path(tree, "0.4.0") bibliography_refs_in_text = [ann for ann in section["annotations"] if ann["name"] == "reference" and ann["value"] == bibliography_item_uuid] - # We must found two refs [58] in Introduction section - self.assertEqual(len(bibliography_refs_in_text), 2) - self.assertEqual(["58,", "58,"], [section["text"][bibliography_refs_in_text[n]["start"]:bibliography_refs_in_text[n]["end"]] for n in range(2)]) + # We must found ref [59] in Introduction section + self.assertEqual(len(bibliography_refs_in_text), 1) + self.assertEqual("59]", section["text"][bibliography_refs_in_text[0]["start"]:bibliography_refs_in_text[0]["end"]]) # check tables self.assertEqual(len(result["content"]["tables"]), 2) diff --git a/tests/api_tests/test_api_format_docx.py b/tests/api_tests/test_api_format_docx.py index 0728d14c..01bb0738 100644 --- a/tests/api_tests/test_api_format_docx.py +++ b/tests/api_tests/test_api_format_docx.py @@ -92,7 +92,25 @@ def test_not_stripped_xml(self) -> None: self._send_request("not_stripped_xml.docx", expected_code=200) def test_docx_with_comments(self) -> None: - _ = self._send_request("with_comments.docx", expected_code=200) + content = self._send_request("with_comments.docx")["content"] + structure = content["structure"] + + node = get_by_tree_path(structure, "0.0.0") + annotations = [ann for ann in node["annotations"] if ann["name"] == "linked_text"] + self.assertEqual(len(annotations), 2) + self.assertIn("Interesting entity type", annotations[0]["value"]) + self.assertIn("Some reply", annotations[0]["value"]) + self.assertIn("New comment", annotations[1]["value"]) + + node = get_by_tree_path(structure, "0.0.1.6") + annotations = [ann for ann in node["annotations"] if ann["name"] == "linked_text"] + self.assertEqual(len(annotations), 1) + self.assertIn("Примечание об организации", annotations[0]["value"]) + + cell_node = content["tables"][1]["cells"][1][0]["lines"][0] + annotations = [ann for ann in cell_node["annotations"] if ann["name"] == "linked_text"] + self.assertEqual(len(annotations), 1) + self.assertIn("Примечание о методе LSTM", annotations[0]["value"]) def test_return_html(self) -> None: file_name = "example.doc" diff --git a/tests/api_tests/test_api_format_excel.py b/tests/api_tests/test_api_format_excel.py index 6e82923f..1e5c75f1 100644 --- a/tests/api_tests/test_api_format_excel.py +++ b/tests/api_tests/test_api_format_excel.py @@ -37,6 +37,22 @@ def test_xls(self) -> None: tables = result["content"]["tables"] self.__check_content(tables) + def test_xlsx_comments(self) -> None: + file_name = "with_comments.xlsx" + tables = self._send_request(file_name)["content"]["tables"] + + self.__check_cell_comment(tables, 0, 1, 2, "Примечание что телефон указан не верно") + self.__check_cell_comment(tables, 0, 4, 0, "Заметка об организации") + self.__check_cell_comment(tables, 1, 0, 1, "Неточное название столбца") + self.__check_cell_comment(tables, 1, 5, 0, "Примечание о персоне Иванов Сергей") + self.__check_cell_comment(tables, 1, 9, 2, "Номер телефона под вопросом") + + def __check_cell_comment(self, tables: dict, table_id: int, row_id: int, col_id: int, text: str) -> None: + cell_node = tables[table_id]["cells"][row_id][col_id]["lines"][0] + annotations = [ann for ann in cell_node["annotations"] if ann["name"] == "linked_text"] + self.assertEqual(len(annotations), 1) + self.assertIn(text, annotations[0]["value"]) + def test_ods_formulas(self) -> None: file_name = "example_formulas.ods" result = self._send_request(file_name) diff --git a/tests/data/docx/with_comments.docx b/tests/data/docx/with_comments.docx index 23dad5b3..0ee218d0 100644 Binary files a/tests/data/docx/with_comments.docx and b/tests/data/docx/with_comments.docx differ diff --git a/tests/data/xlsx/with_comments.xlsx b/tests/data/xlsx/with_comments.xlsx new file mode 100644 index 00000000..9615f670 Binary files /dev/null and b/tests/data/xlsx/with_comments.xlsx differ