Skip to content

Commit e48acb1

Browse files
new version 2.6.1 (#551)
Co-authored-by: Maxim Deryugin <maxfelmosmartin@gmail.com>
1 parent 67b0d42 commit e48acb1

File tree

25 files changed

+110
-94
lines changed

25 files changed

+110
-94
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ dedoc/version.py
1111

1212
# Distribution / packaging
1313
.Python
14+
etc/
1415
env/
1516
build/
1617
develop-eggs/

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ ENV RESOURCES_PATH "/dedoc_root/resources"
99
COPY requirements.txt .
1010
RUN pip3 install --no-cache-dir -r requirements.txt
1111
RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends fontforge
12+
RUN apt install -y libutf8proc-dev
13+
RUN ln -s /usr/lib/x86_64-linux-gnu/libutf8proc.so /usr/lib/libutf8proc.so.1
1214

1315
RUN mkdir /dedoc_root
1416
RUN mkdir /dedoc_root/dedoc

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.6
1+
2.6.1

dedoc/extensions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363
eml_like_format={".eml"},
6464
mhtml_like_format={".mhtml", ".mht", ".mhtml.gz", ".mht.gz"},
6565
archive_like_format={".zip", ".tar", ".tar.gz", ".rar", ".7z"},
66-
image_like_format={".png"},
66+
image_like_format={".png", ".jpg", ".jpeg", ".tiff", ".tif"},
6767
pdf_like_format={".pdf"},
6868
csv_like_format={".csv", ".tsv"},
6969
txt_like_format={".txt", ".txt.gz"},

dedoc/readers/archive_reader/archive_reader.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -85,14 +85,19 @@ def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo
8585
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
8686

8787
def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
88-
import py7zlib
89-
90-
with open(path, "rb") as content:
91-
arch_file = py7zlib.Archive7z(content)
92-
names = arch_file.getnames()
93-
for name in names:
94-
file = arch_file.getmember(name)
95-
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
88+
import os
89+
import py7zr
90+
import tempfile
91+
92+
with tempfile.TemporaryDirectory() as tmpdir:
93+
with py7zr.SevenZipFile(path, "r") as arch_file:
94+
arch_file.extractall(tmpdir)
95+
96+
for dir_path, _, file_names in os.walk(tmpdir):
97+
for file_name in file_names:
98+
file_path = os.path.join(dir_path, file_name)
99+
with open(file_path, "rb") as file:
100+
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=file_name, file=file, need_content_analysis=need_content_analysis)
96101

97102
def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile:
98103
import os

dedoc/readers/docx_reader/numbering_extractor.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,10 @@ def parse(self, xml: Tag, paragraph_properties: BaseProperties, run_properties:
6767
else:
6868
ilvl = ilvl["w:val"]
6969

70-
lvl_info: LevelInfo = self.num_dict[num_id].level_number2level_info[ilvl]
70+
try:
71+
lvl_info: LevelInfo = self.num_dict[num_id].level_number2level_info[ilvl]
72+
except KeyError:
73+
return
7174
text = self.__get_list_item_text(ilvl, num_id)
7275

7376
# change style of the paragraph/run: style -> pPr -> rPr

dedoc/readers/docx_reader/styles_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def __get_styles_hierarchy(self, style: Tag, style_id: str, style_type: StyleTyp
9999

100100
styles = [style]
101101
current_style = style
102-
while current_style.basedOn:
102+
while current_style and current_style.basedOn:
103103
try:
104104
parent_style_id = current_style.basedOn["w:val"]
105105
current_style = self.__find_style(parent_style_id, style_type)

dedoc/readers/pdf_reader/data_classes/tables/table_type.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
class TableTypeAdditionalOptions:
22
"""
3-
Setting up the table recognizer. The value of the parameter specifies the type of tables recognized when processed by
3+
Enum of table types for the table recognizer.
4+
The value of the parameter specifies the type of tables recognized when processed by
45
class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`.
56
6-
* Parameter `table_type=wo_external_bounds` - recognize tables without external bounds;
7+
* Parameter `table_type=wo_external_bounds` - recognize tables without external bounds.
78
89
Example of a table of type `wo_external_bounds`::
910
@@ -16,7 +17,7 @@ class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_
1617
text | text | text
1718
1819
19-
* Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table;
20+
* Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table.
2021
2122
Example of a page with a table of type `one_cell_table`::
2223
@@ -27,7 +28,7 @@ class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_
2728
+------+
2829
________________________
2930
30-
* Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table;
31+
* Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table.
3132
3233
Example of a table of type `split_last_column`::
3334

dedoc/readers/pdf_reader/pdf_base_reader.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
1313
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
1414
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
15-
from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector
15+
1616

1717
ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
1818
"is_one_column_document",
@@ -44,6 +44,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
4444
from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor
4545
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
4646
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
47+
from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector
4748
from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker
4849
from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor
4950

dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,13 @@
2121

2222
class TableRecognizer:
2323
"""
24-
The class recognizes tables from document images. This class is internal to the system. It is called from readers such as .
25-
26-
* The class recognizes tables with borders from the document image and returns the class
27-
(function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`);
28-
29-
30-
* The class also analyzes recognized single-page tables and combines them into multi-page ones
31-
(function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables`);
24+
The class recognizes tables from document images. This class is internal to the system.
25+
It is called from readers such as :class:`dedoc.readers.PdfTxtlayerReader` or :class:`dedoc.readers.PdfImageReader`.
3226
27+
* The class recognizes tables with borders from the document image using
28+
:meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`;
29+
* The class also analyzes recognized single-page tables and combines them into multi-page ones using
30+
:meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables`.
3331
"""
3432

3533
def __init__(self, *, config: dict = None) -> None:

0 commit comments

Comments
 (0)