new version 2.6.1 (#551)

NastyBoget · hellhoundisalab · web-flow · commit e48acb17fdab · 2025-12-16T14:50:23.000+03:00
Co-authored-by: Maxim Deryugin <maxfelmosmartin@gmail.com>
diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,7 @@ dedoc/version.py
 
 # Distribution / packaging
 .Python
+etc/
 env/
 build/
 develop-eggs/
diff --git a/Dockerfile b/Dockerfile
@@ -9,6 +9,8 @@ ENV RESOURCES_PATH "/dedoc_root/resources"
 COPY requirements.txt .
 RUN pip3 install --no-cache-dir -r requirements.txt
 RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends fontforge
+RUN apt install -y libutf8proc-dev
+RUN ln -s /usr/lib/x86_64-linux-gnu/libutf8proc.so /usr/lib/libutf8proc.so.1
 
 RUN mkdir /dedoc_root
 RUN mkdir /dedoc_root/dedoc
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.6
+2.6.1
diff --git a/dedoc/extensions.py b/dedoc/extensions.py
@@ -63,7 +63,7 @@
     eml_like_format={".eml"},
     mhtml_like_format={".mhtml", ".mht", ".mhtml.gz", ".mht.gz"},
     archive_like_format={".zip", ".tar", ".tar.gz", ".rar", ".7z"},
-    image_like_format={".png"},
+    image_like_format={".png", ".jpg", ".jpeg", ".tiff", ".tif"},
     pdf_like_format={".pdf"},
     csv_like_format={".csv", ".tsv"},
     txt_like_format={".txt", ".txt.gz"},
diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py
@@ -85,14 +85,19 @@ def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo
                     yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
 
     def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
-        import py7zlib
-
-        with open(path, "rb") as content:
-            arch_file = py7zlib.Archive7z(content)
-            names = arch_file.getnames()
-            for name in names:
-                file = arch_file.getmember(name)
-                yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
+        import os
+        import py7zr
+        import tempfile
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with py7zr.SevenZipFile(path, "r") as arch_file:
+                arch_file.extractall(tmpdir)
+
+            for dir_path, _, file_names in os.walk(tmpdir):
+                for file_name in file_names:
+                    file_path = os.path.join(dir_path, file_name)
+                    with open(file_path, "rb") as file:
+                        yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=file_name, file=file, need_content_analysis=need_content_analysis)
 
     def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile:
         import os
diff --git a/dedoc/readers/docx_reader/numbering_extractor.py b/dedoc/readers/docx_reader/numbering_extractor.py
@@ -67,7 +67,10 @@ def parse(self, xml: Tag, paragraph_properties: BaseProperties, run_properties:
         else:
             ilvl = ilvl["w:val"]
 
-        lvl_info: LevelInfo = self.num_dict[num_id].level_number2level_info[ilvl]
+        try:
+            lvl_info: LevelInfo = self.num_dict[num_id].level_number2level_info[ilvl]
+        except KeyError:
+            return
         text = self.__get_list_item_text(ilvl, num_id)
 
         # change style of the paragraph/run: style -> pPr -> rPr
diff --git a/dedoc/readers/docx_reader/styles_extractor.py b/dedoc/readers/docx_reader/styles_extractor.py
@@ -99,7 +99,7 @@ def __get_styles_hierarchy(self, style: Tag, style_id: str, style_type: StyleTyp
 
         styles = [style]
         current_style = style
-        while current_style.basedOn:
+        while current_style and current_style.basedOn:
             try:
                 parent_style_id = current_style.basedOn["w:val"]
                 current_style = self.__find_style(parent_style_id, style_type)
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_type.py b/dedoc/readers/pdf_reader/data_classes/tables/table_type.py
@@ -1,9 +1,10 @@
 class TableTypeAdditionalOptions:
     """
-    Setting up the table recognizer. The value of the parameter specifies the type of tables recognized when processed by
+    Enum of table types for the table recognizer.
+    The value of the parameter specifies the type of tables recognized when processed by
     class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`.
 
-    * Parameter `table_type=wo_external_bounds` - recognize tables without external bounds;
+    * Parameter `table_type=wo_external_bounds` - recognize tables without external bounds.
 
     Example of a table of type `wo_external_bounds`::
 
@@ -16,7 +17,7 @@ class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_
        text   | text | text
 
 
-    * Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table;
+    * Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table.
 
     Example of a page with a table of type `one_cell_table`::
 
@@ -27,7 +28,7 @@ class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_
                         +------+
          ________________________
 
-    * Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table;
+    * Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table.
 
     Example of a table of type `split_last_column`::
 
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -12,7 +12,7 @@
 from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
 from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
-from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector
+
 
 ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
     "is_one_column_document",
@@ -44,6 +44,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
         from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor
         from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
         from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
+        from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector
         from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker
         from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor
 
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -21,15 +21,13 @@
 
 class TableRecognizer:
     """
-    The class recognizes tables from document images. This class is internal to the system. It is called from readers such as .
-
-    * The class recognizes tables with borders from the document image and returns the class
-        (function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`);
-
-
-    * The class also analyzes recognized single-page tables and combines them into multi-page ones
-        (function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables`);
+    The class recognizes tables from document images. This class is internal to the system.
+    It is called from readers such as :class:`dedoc.readers.PdfTxtlayerReader` or :class:`dedoc.readers.PdfImageReader`.
 
+    * The class recognizes tables with borders from the document image using
+      :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`;
+    * The class also analyzes recognized single-page tables and combines them into multi-page ones using
+      :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables`
     """
 
     def __init__(self, *, config: dict = None) -> None:
diff --git a/dedoc/readers/pdf_reader/utils/header_footers_analysis.py b/dedoc/readers/pdf_reader/utils/header_footers_analysis.py
@@ -16,18 +16,18 @@ class HeaderFooterDetector:
     `Lin X. Header and footer extraction by page association //Document Recognition and Retrieval X. – SPIE, 2003. – Т. 5010. – С. 164-171.`
 
     Algorithm's notes:
-        1) For documents of 6 pages or more, lines on even and odd pages of the document are compared to detect alternating footers-headers.
-        For documents of less than 6 pages, lines between adjacent pages (between even or odd pages) are compared.
-        Therefore, alternating footers-headers will not be detected on documents of less than 6 pages.
 
-        2) The algorithm analyzes the first 4 and last 4 lines on each page of the document and,
-        by comparing lines across pages, identifies common footer-header patterns using Levenshtein similarity.
+        1. For documents of 6 pages or more, lines on even and odd pages of the document are compared to detect alternating footers-headers.
+           For documents of less than 6 pages, lines between adjacent pages (between even or odd pages) are compared.
+           Therefore, alternating footers-headers will not be detected on documents of less than 6 pages.
 
-        3) For the algorithm to work, the document must have at least two pages of text.
-        It is not an ML algorithm it cannot work with just one page.
+        2. The algorithm analyzes the first 4 and last 4 lines on each page of the document and,
+           by comparing lines across pages, identifies common footer-header patterns using Levenshtein similarity.
 
-        4) The more pages the better. Remember the parameter `pages` limits the number of pages in a document.
+        3. For the algorithm to work, the document must have at least two pages of text.
+           It is not an ML algorithm so it cannot work with just one page.
 
+        4. The more pages, the better. Remember that the parameter `pages` limits the number of pages in a document.
     """
 
     def __init__(self) -> None:
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -1,28 +1,33 @@
 Changelog
 =========
+
+v2.6.1 (2025-12-16)
+-------------------
+Release note: `v2.6.1 <https://github.com/ispras/dedoc/releases/tag/v2.6.1>`_
+
+* Fixed some bugs in `DocxReader`.
+* Replaced outdated `pylzma` dependency with `py7zr`.
+
 v2.6 (2025-09-19)
 -----------------
 Release note: `v2.6 <https://github.com/ispras/dedoc/releases/tag/v2.6>`_
 
-* improve table merge algorithm (added check on table layout) `MultiPageTableExtractor`.
-* refactoring table merge `MultiPageTableExtractor`.
-* improve header footer analysis `HeaderFooterDetector`.
-* added header footer analysis support in Tabby.
-* added header footer analysis info (parameter `need_header_footer_analysis`) in documentation (readthedocs).
-* update to python3.10.
-* update to ubuntu22.04.
-* added `Contributing Information` (project rules, how build, how develop) in documentation (readthedocs).
-
+* Improved table merge algorithm (added check on table layout) `MultiPageTableExtractor`.
+* Improved header footer analysis `HeaderFooterDetector`.
+* Added header footer analysis support in `PdfTabbyReader`.
+* Added header footer analysis info (parameter `need_header_footer_analysis`) in documentation.
+* Updated to python3.10.
+* Updated to ubuntu22.04.
+* Added `Support and Contributing` (project rules, how to build, how to develop) in documentation.
 
 v2.5 (2025-09-05)
 -----------------
 Release note: `v2.5 <https://github.com/ispras/dedoc/releases/tag/v2.5>`_
 
 * Added simple multilingual textual layer correctness classification based on letter percentage calculation (`textual_layer_classifier=letter`).
 * Added a new parameter `textual_layer_classifier = [simple, ml (default), letter]`.
-* Remove parameter `fast_textual_layer_detection`. Now it is a `textual_layer_classifier=simple`.
-* Fix bug with `table_type=table_wo_external_bounds` (fixed cv2.BoundingRect).
-* Some refactoring `TableRecognition`.
+* Removed parameter `fast_textual_layer_detection`. Now it is a `textual_layer_classifier=simple`.
+* Fixed bug with `table_type=table_wo_external_bounds` (fixed cv2.BoundingRect).
 * Added parameter `table_type` and `TableRecognition` info into documentation.
 
 v2.4 (2025-07-28)
diff --git a/docs/source/contributing/check_documentation.rst b/docs/source/contributing/check_documentation.rst
@@ -9,12 +9,11 @@ Check documentation
 
          pip install .[docs]
 
-2. Documentation files should be located in the `docs/ <https://github.com/ispras/dedoc/blob/master/docs>`_ directory,
-   which must contain the `docs/source/conf.py <https://github.com/ispras/dedoc/blob/master/docs/source/conf.py>`_ (build settings)
-   and `docs/source/index.rst <https://github.com/ispras/dedoc/blob/master/docs/source/index.rst>`_ (documentation main page) files.
-
-3. Build documentation into HTML pages is done as follows:
+2. Documentation files should be located in the `docs/ <https://github.com/ispras/dedoc/blob/master/docs>`_ directory.
+   Building the documentation into HTML pages is done as follows:
 
     .. code-block:: bash
 
          python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build
+
+3. After building, the documentation can be checked locally: open the main built page ``docs/_build/index.html`` in a browser.
diff --git a/docs/source/contributing/contributing.rst b/docs/source/contributing/contributing.rst
@@ -5,12 +5,12 @@ Support and Contributing
 
 Support
 -------
-If you are stuck with a problem using Dedoc, please do get in touch at our `Issues <https://github.com/ispras/dedoc/issues>`_ (recommend)
+If you are stuck with a problem using Dedoc, please use our `Issues <https://github.com/ispras/dedoc/issues>`_ (recommended)
 or `Dedoc Chat <https://t.me/dedoc_chat>`_. The developers are willing to help.
 
 You can save time by following this procedure when reporting a problem:
 
-    * Do try to solve the problem on your own first. Read the documentation, including using the search feature, index and reference documentation.
+    * Try to solve the problem on your own first. Read the documentation, including using the search feature, index and reference documentation.
 
     * Search the issue archives to see if someone else already had the same problem.
 
@@ -23,7 +23,9 @@ Contributing Rules
 
     * To add new features to the project repository yourself, you should follow
       the `general contributing rules of github <https://github.com/firstcontributions/first-contributions>`_.
-      In your Pull Request, set `develop` as the target branch.
+
+      .. note::
+          In your Pull Request, set `develop` as the target branch.
 
     * We recommend using `Pycharm IDE` and `virtualenv` package for development.
 
@@ -34,28 +36,26 @@ Contributing Rules
     * We strongly recommend using the already used ML library `torch` in development. For example,
       using `tensorflow` library instead of `torch` is justified only in case of extreme necessity.
 
-    * If you add new functionality to dedoc, be sure to add python `unitests` to test the added functionality
-      (you can add api tests in `tests/api_tests <https://github.com/ispras/dedoc/blob/master/tests/api_tests>`_,
-      you can add unit tests in `tests/unit_tests <https://github.com/ispras/dedoc/blob/master/tests/unit_tests>`_).
+    * If you add new functionality to dedoc, be sure to add python `unittest` tests covering the added functionality
+      (you can add api tests in `tests/api_tests <https://github.com/ispras/dedoc/blob/master/tests/api_tests>`_
+      or unit tests in `tests/unit_tests <https://github.com/ispras/dedoc/blob/master/tests/unit_tests>`_).
       These tests are run automatically in the Continuous Integration pipeline.
+      To run tests locally, you can use docker as described in the `README <https://github.com/ispras/dedoc/blob/master/README.md#4-run-container-with-tests>`_.
 
     * Before each commit, check the code style using the automatic checker using the `flake8` library.
-      Instructions for using flake8 are provided here :ref:using_flake8`.
+      Instructions for using flake8 are provided in :ref:`using_flake8`.
 
     * We recommend setting up pre-commit for convenience and speeding up development according to the instructions :ref:`using_precommit` .
-      This will run a style check of the changed code with each commit.
+      This will run a style check of the changed code before each commit.
 
     * In case of any change in the online documentation of the project (for example, when adding a new api parameter),
       be sure to check locally that the changed documentation is successfully built and looks as expected.
       Building online documentation using `sphinx` is described here :ref:`check_documentation`.
 
 .. toctree::
    :maxdepth: 1
+   :hidden:
 
    using_flake8
    using_precommit
    check_documentation
-
-
-
-
diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
@@ -279,6 +279,16 @@ Api parameters description
       - false
       - This option is used to **remove** headers and footers of PDF documents from the output result.
         If ``need_header_footer_analysis=false``, header and footer lines will present in the output as well as all other document lines.
+        The algorithm is implemented and described in the class :class:`~dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector`.
+
+    * - table_type
+      - "", wo_external_bounds, one_cell_table, split_last_column and their combinations
+      - ""
+      - Setting up the table recognition method. This option is used for PDF documents which are images with text (PDF without a textual layer).
+        It is also used for PDF documents when ``pdf_with_text_layer`` is ``true``, ``false``, ``auto`` or ``auto_tabby``.
+        The value of the parameter specifies the type of tables for recognition,
+        supported table types are described in :class:`~dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions`.
+        You can use a combination of values (for example, ``wo_external_bounds+one_cell_table``).
 
     * - need_binarization
       - true, false
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
@@ -182,4 +182,4 @@ For ``python3.9``:
   .. code-block:: bash
 
     pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torch-1.11.0a0+git137096a-cp39-cp39-linux_x86_64.whl
-    pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torchvision-0.12.0a0%2B9b5a3fe-cp39-cp39-linux_x86_64.whl
+    pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torchvision-0.12.0a0%2B9b5a3fe-cp39-cp39-linux_x86_64.whl
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -278,6 +278,7 @@ This type of structure is configurable (see :ref:`using_patterns`).
    modules/metadata_extractors
    modules/structure_extractors
    modules/structure_constructors
+   modules/pdf_parsing
 
 
 .. toctree::
diff --git a/docs/source/modules/manager.rst b/docs/source/modules/manager.rst
@@ -10,15 +10,3 @@ Dedoc pipeline
 .. autoclass:: dedoc.attachments_handler.AttachmentsHandler
     :special-members: __init__
     :members:
-
-.. autoclass:: dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer
-    :show-inheritance:
-    :members:
-
-.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions
-    :show-inheritance:
-    :members:
-
-.. autoclass:: dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector
-    :show-inheritance:
-    :members:
diff --git a/docs/source/modules/pdf_parsing.rst b/docs/source/modules/pdf_parsing.rst
@@ -0,0 +1,14 @@
+.. _pdf_parsing:
+
+Auxiliary data structures for PDF and images parsing
+====================================================
+
+
+.. autoclass:: dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer
+    :members:
+
+.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions
+    :members:
+
+.. autoclass:: dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector
+    :members:
diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst
diff --git a/labeling/Dockerfile b/labeling/Dockerfile
diff --git a/requirements.txt b/requirements.txt
diff --git a/tests/Dockerfile b/tests/Dockerfile
diff --git a/tests/data/with_attachments/attachments.7z b/tests/data/with_attachments/attachments.7z
diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py