Skip to content

Commit 0c6d537

Browse files
committed
Change default behaviour to not parse math
1 parent e9af307 commit 0c6d537

File tree

6 files changed

+30
-25
lines changed

6 files changed

+30
-25
lines changed

README.md

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
<h1 align="center">PDF Document Layout Analysis</h1>
32
<p align="center">A Docker-powered microservice for intelligent PDF document layout analysis, OCR, and content extraction</p>
43

@@ -155,7 +154,7 @@ The service provides a comprehensive RESTful API with the following endpoints:
155154

156155
| Endpoint | Method | Description | Parameters |
157156
|----------|--------|-------------|------------|
158-
| `/` | POST | Analyze PDF layout and extract segments | `file`, `fast`, `ocr_tables` |
157+
| `/` | POST | Analyze PDF layout and extract segments | `file`, `fast`, `parse_tables_and_math` |
159158
| `/save_xml/{filename}` | POST | Analyze PDF and save XML output | `file`, `xml_file_name`, `fast` |
160159
| `/get_xml/{filename}` | GET | Retrieve saved XML analysis | `xml_file_name` |
161160

@@ -188,7 +187,7 @@ The service provides a comprehensive RESTful API with the following endpoints:
188187

189188
- **`file`**: PDF file to process (multipart/form-data)
190189
- **`fast`**: Use LightGBM models instead of VGT (boolean, default: false)
191-
- **`ocr_tables`**: Apply OCR to table regions (boolean, default: false)
190+
- **`parse_tables_and_math`**: Apply OCR to table regions and convert formulas to LaTeX (boolean, default: false)
192191
- **`language`**: OCR language code (string, default: "en")
193192
- **`types`**: Comma-separated content types to extract (string, default: "all")
194193
- **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
@@ -213,11 +212,11 @@ curl -X POST \
213212
http://localhost:5060
214213
```
215214

216-
**Analysis with table OCR:**
215+
**Analysis with table and math parsing:**
217216
```bash
218217
curl -X POST \
219218
-F 'file=@document.pdf' \
220-
-F 'ocr_tables=true' \
219+
-F 'parse_tables_and_math=true' \
221220
http://localhost:5060
222221
```
223222

@@ -680,10 +679,10 @@ For segments without text (e.g., images):
680679

681680
#### Enhanced Table Extraction
682681

683-
OCR tables and extract them in HTML format by setting `ocr_tables=true`:
682+
Parse tables and extract them in HTML format by setting `parse_tables_and_math=true`:
684683

685684
```bash
686-
curl -X POST -F 'file=@document.pdf' -F 'ocr_tables=true' http://localhost:5060
685+
curl -X POST -F 'file=@document.pdf' -F 'parse_tables_and_math=true' http://localhost:5060
687686
```
688687

689688

@@ -909,4 +908,3 @@ We welcome contributions to improve the PDF Document Layout Analysis service!
909908
### License
910909

911910
This project is licensed under the terms specified in the [LICENSE](LICENSE) file.
912-

src/adapters/infrastructure/pdf_analysis_service_adapter.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def __init__(
2222
self.file_repository = file_repository
2323

2424
def analyze_pdf_layout(
25-
self, pdf_content: AnyStr, xml_filename: str = "", ocr_tables: bool = False, keep_pdf: bool = False
25+
self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
2626
) -> list[dict]:
2727
pdf_path = self.file_repository.save_pdf(pdf_content)
2828
service_logger.info("Creating PDF images")
@@ -31,9 +31,9 @@ def analyze_pdf_layout(
3131

3232
predicted_segments = self.vgt_model_service.predict_document_layout(pdf_images_list)
3333

34-
pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
35-
self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
36-
if ocr_tables:
34+
if parse_tables_and_math:
35+
pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
36+
self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
3737
self.format_conversion_service.convert_table_to_html(pdf_images_200_dpi, predicted_segments)
3838

3939
if not keep_pdf:
@@ -45,7 +45,7 @@ def analyze_pdf_layout(
4545
]
4646

4747
def analyze_pdf_layout_fast(
48-
self, pdf_content: AnyStr, xml_filename: str = "", ocr_tables: bool = False, keep_pdf: bool = False
48+
self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
4949
) -> list[dict]:
5050
pdf_path = self.file_repository.save_pdf(pdf_content)
5151
service_logger.info("Creating PDF images for fast analysis")
@@ -54,9 +54,9 @@ def analyze_pdf_layout_fast(
5454

5555
predicted_segments = self.fast_model_service.predict_layout_fast(pdf_images_list)
5656

57-
pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
58-
self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
59-
if ocr_tables:
57+
if parse_tables_and_math:
58+
pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
59+
self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
6060
self.format_conversion_service.convert_table_to_html(pdf_images_list[0], predicted_segments)
6161

6262
if not keep_pdf:

src/adapters/web/fastapi_controllers.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,12 @@ async def info(self):
5151
async def error(self):
5252
raise FileNotFoundError("This is a test error from the error endpoint")
5353

54-
async def analyze_pdf(self, file: UploadFile = File(...), fast: bool = Form(False), ocr_tables: bool = Form(False)):
55-
return await run_in_threadpool(self.analyze_pdf_use_case.execute, file.file.read(), "", ocr_tables, fast, False)
54+
async def analyze_pdf(
55+
self, file: UploadFile = File(...), fast: bool = Form(False), parse_tables_and_math: bool = Form(False)
56+
):
57+
return await run_in_threadpool(
58+
self.analyze_pdf_use_case.execute, file.file.read(), "", parse_tables_and_math, fast, False
59+
)
5660

5761
async def analyze_and_save_xml(
5862
self, file: UploadFile = File(...), xml_file_name: str | None = None, fast: bool = Form(False)

src/ports/services/pdf_analysis_service.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55
class PDFAnalysisService(ABC):
66
@abstractmethod
77
def analyze_pdf_layout(
8-
self, pdf_content: AnyStr, xml_filename: str = "", ocr_tables: bool = False, keep_pdf: bool = False
8+
self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
99
) -> list[dict]:
1010
pass
1111

1212
@abstractmethod
1313
def analyze_pdf_layout_fast(
14-
self, pdf_content: AnyStr, xml_filename: str = "", ocr_tables: bool = False, keep_pdf: bool = False
14+
self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
1515
) -> list[dict]:
1616
pass

src/tests/test_end_to_end.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def test_text_extraction_fast(self):
242242
def test_table_extraction(self):
243243
with open(f"{ROOT_PATH}/test_pdfs/table.pdf", "rb") as stream:
244244
files = {"file": stream}
245-
data = {"ocr_tables": "true"}
245+
data = {"parse_tables_and_math": "true"}
246246

247247
response = requests.post(f"{self.service_url}", files=files, data=data)
248248

@@ -268,8 +268,9 @@ def test_table_extraction(self):
268268
def test_formula_extraction(self):
269269
with open(f"{ROOT_PATH}/test_pdfs/formula.pdf", "rb") as stream:
270270
files = {"file": stream}
271+
data = {"parse_tables_and_math": "true"}
271272

272-
response = requests.post(f"{self.service_url}", files=files)
273+
response = requests.post(f"{self.service_url}", files=files, data=data)
273274

274275
response_json = response.json()
275276
formula_text = response_json[1]["text"]

src/use_cases/pdf_analysis/analyze_pdf_use_case.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,16 @@ def execute(
1616
self,
1717
pdf_content: AnyStr,
1818
xml_filename: str = "",
19-
ocr_tables: bool = False,
19+
parse_tables_and_math: bool = False,
2020
use_fast_mode: bool = False,
2121
keep_pdf: bool = False,
2222
) -> list[dict]:
2323
if use_fast_mode:
24-
return self.pdf_analysis_service.analyze_pdf_layout_fast(pdf_content, xml_filename, ocr_tables, keep_pdf)
24+
return self.pdf_analysis_service.analyze_pdf_layout_fast(
25+
pdf_content, xml_filename, parse_tables_and_math, keep_pdf
26+
)
2527
else:
26-
return self.pdf_analysis_service.analyze_pdf_layout(pdf_content, xml_filename, ocr_tables, keep_pdf)
28+
return self.pdf_analysis_service.analyze_pdf_layout(pdf_content, xml_filename, parse_tables_and_math, keep_pdf)
2729

2830
def execute_and_save_xml(self, pdf_content: AnyStr, xml_filename: str, use_fast_mode: bool = False) -> list[dict]:
2931
result = self.execute(pdf_content, xml_filename, False, use_fast_mode, keep_pdf=False)

0 commit comments

Comments
 (0)