Change default behaviour to not parse math

gabriel-piles · gabriel-piles · commit 0c6d5377cea3 · 2025-08-21T12:01:00.000+02:00
diff --git a/README.md b/README.md
@@ -1,4 +1,3 @@
-
 <h1 align="center">PDF Document Layout Analysis</h1>
 <p align="center">A Docker-powered microservice for intelligent PDF document layout analysis, OCR, and content extraction</p>
 
@@ -155,7 +154,7 @@ The service provides a comprehensive RESTful API with the following endpoints:
 
 | Endpoint | Method | Description | Parameters |
 |----------|--------|-------------|------------|
-| `/` | POST | Analyze PDF layout and extract segments | `file`, `fast`, `ocr_tables` |
+| `/` | POST | Analyze PDF layout and extract segments | `file`, `fast`, `parse_tables_and_math` |
 | `/save_xml/{filename}` | POST | Analyze PDF and save XML output | `file`, `xml_file_name`, `fast` |
 | `/get_xml/{filename}` | GET | Retrieve saved XML analysis | `xml_file_name` |
 
@@ -188,7 +187,7 @@ The service provides a comprehensive RESTful API with the following endpoints:
 
 - **`file`**: PDF file to process (multipart/form-data)
 - **`fast`**: Use LightGBM models instead of VGT (boolean, default: false)
-- **`ocr_tables`**: Apply OCR to table regions (boolean, default: false)
+- **`parse_tables_and_math`**: Apply OCR to table regions and convert formulas to LaTeX (boolean, default: false)
 - **`language`**: OCR language code (string, default: "en")
 - **`types`**: Comma-separated content types to extract (string, default: "all")
 - **`extract_toc`**: Include table of contents at the beginning of the output (boolean, default: false)
@@ -213,11 +212,11 @@ curl -X POST \
   http://localhost:5060
 ```
 
-**Analysis with table OCR:**
+**Analysis with table and math parsing:**
 ```bash
 curl -X POST \
   -F 'file=@document.pdf' \
-  -F 'ocr_tables=true' \
+  -F 'parse_tables_and_math=true' \
   http://localhost:5060
 ```
 
@@ -680,10 +679,10 @@ For segments without text (e.g., images):
 
 #### Enhanced Table Extraction
 
-OCR tables and extract them in HTML format by setting `ocr_tables=true`:
+Parse tables and extract them in HTML format by setting `parse_tables_and_math=true` (this option also converts formulas to LaTeX):
 
 ```bash
-curl -X POST -F 'file=@document.pdf' -F 'ocr_tables=true' http://localhost:5060
+curl -X POST -F 'file=@document.pdf' -F 'parse_tables_and_math=true' http://localhost:5060
 ```
 
 
@@ -909,4 +908,3 @@ We welcome contributions to improve the PDF Document Layout Analysis service!
 ### License
 
 This project is licensed under the terms specified in the [LICENSE](LICENSE) file.
-
diff --git a/src/adapters/infrastructure/pdf_analysis_service_adapter.py b/src/adapters/infrastructure/pdf_analysis_service_adapter.py
@@ -22,7 +22,7 @@ def __init__(
         self.file_repository = file_repository
 
     def analyze_pdf_layout(
-        self, pdf_content: AnyStr, xml_filename: str = "", ocr_tables: bool = False, keep_pdf: bool = False
+        self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
     ) -> list[dict]:
         pdf_path = self.file_repository.save_pdf(pdf_content)
         service_logger.info("Creating PDF images")
@@ -31,9 +31,9 @@ def analyze_pdf_layout(
 
         predicted_segments = self.vgt_model_service.predict_document_layout(pdf_images_list)
 
-        pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
-        self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
-        if ocr_tables:
+        if parse_tables_and_math:
+            pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
+            self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
             self.format_conversion_service.convert_table_to_html(pdf_images_200_dpi, predicted_segments)
 
         if not keep_pdf:
@@ -45,7 +45,7 @@ def analyze_pdf_layout(
         ]
 
     def analyze_pdf_layout_fast(
-        self, pdf_content: AnyStr, xml_filename: str = "", ocr_tables: bool = False, keep_pdf: bool = False
+        self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
     ) -> list[dict]:
         pdf_path = self.file_repository.save_pdf(pdf_content)
         service_logger.info("Creating PDF images for fast analysis")
@@ -54,9 +54,9 @@ def analyze_pdf_layout_fast(
 
         predicted_segments = self.fast_model_service.predict_layout_fast(pdf_images_list)
 
-        pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
-        self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
-        if ocr_tables:
+        if parse_tables_and_math:
+            pdf_images_200_dpi = PdfImages.from_pdf_path(pdf_path, "", xml_filename, dpi=200)
+            self.format_conversion_service.convert_formula_to_latex(pdf_images_200_dpi, predicted_segments)
             self.format_conversion_service.convert_table_to_html(pdf_images_list[0], predicted_segments)
 
         if not keep_pdf:
diff --git a/src/adapters/web/fastapi_controllers.py b/src/adapters/web/fastapi_controllers.py
@@ -51,8 +51,12 @@ async def info(self):
     async def error(self):
         raise FileNotFoundError("This is a test error from the error endpoint")
 
-    async def analyze_pdf(self, file: UploadFile = File(...), fast: bool = Form(False), ocr_tables: bool = Form(False)):
-        return await run_in_threadpool(self.analyze_pdf_use_case.execute, file.file.read(), "", ocr_tables, fast, False)
+    async def analyze_pdf(
+        self, file: UploadFile = File(...), fast: bool = Form(False), parse_tables_and_math: bool = Form(False)
+    ):
+        return await run_in_threadpool(
+            self.analyze_pdf_use_case.execute, file.file.read(), "", parse_tables_and_math, fast, False
+        )
 
     async def analyze_and_save_xml(
         self, file: UploadFile = File(...), xml_file_name: str | None = None, fast: bool = Form(False)
diff --git a/src/ports/services/pdf_analysis_service.py b/src/ports/services/pdf_analysis_service.py
@@ -5,12 +5,12 @@
 class PDFAnalysisService(ABC):
     @abstractmethod
     def analyze_pdf_layout(
-        self, pdf_content: AnyStr, xml_filename: str = "", ocr_tables: bool = False, keep_pdf: bool = False
+        self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
     ) -> list[dict]:
         pass
 
     @abstractmethod
     def analyze_pdf_layout_fast(
-        self, pdf_content: AnyStr, xml_filename: str = "", ocr_tables: bool = False, keep_pdf: bool = False
+        self, pdf_content: AnyStr, xml_filename: str = "", parse_tables_and_math: bool = False, keep_pdf: bool = False
     ) -> list[dict]:
         pass
diff --git a/src/tests/test_end_to_end.py b/src/tests/test_end_to_end.py
@@ -242,7 +242,7 @@ def test_text_extraction_fast(self):
     def test_table_extraction(self):
         with open(f"{ROOT_PATH}/test_pdfs/table.pdf", "rb") as stream:
             files = {"file": stream}
-            data = {"ocr_tables": "true"}
+            data = {"parse_tables_and_math": "true"}
 
             response = requests.post(f"{self.service_url}", files=files, data=data)
 
@@ -268,8 +268,9 @@ def test_table_extraction(self):
     def test_formula_extraction(self):
         with open(f"{ROOT_PATH}/test_pdfs/formula.pdf", "rb") as stream:
             files = {"file": stream}
+            data = {"parse_tables_and_math": "true"}
 
-            response = requests.post(f"{self.service_url}", files=files)
+            response = requests.post(f"{self.service_url}", files=files, data=data)
 
             response_json = response.json()
             formula_text = response_json[1]["text"]
diff --git a/src/use_cases/pdf_analysis/analyze_pdf_use_case.py b/src/use_cases/pdf_analysis/analyze_pdf_use_case.py
@@ -16,14 +16,16 @@ def execute(
         self,
         pdf_content: AnyStr,
         xml_filename: str = "",
-        ocr_tables: bool = False,
+        parse_tables_and_math: bool = False,
         use_fast_mode: bool = False,
         keep_pdf: bool = False,
     ) -> list[dict]:
         if use_fast_mode:
-            return self.pdf_analysis_service.analyze_pdf_layout_fast(pdf_content, xml_filename, ocr_tables, keep_pdf)
+            return self.pdf_analysis_service.analyze_pdf_layout_fast(
+                pdf_content, xml_filename, parse_tables_and_math, keep_pdf
+            )
         else:
-            return self.pdf_analysis_service.analyze_pdf_layout(pdf_content, xml_filename, ocr_tables, keep_pdf)
+            return self.pdf_analysis_service.analyze_pdf_layout(pdf_content, xml_filename, parse_tables_and_math, keep_pdf)
 
     def execute_and_save_xml(self, pdf_content: AnyStr, xml_filename: str, use_fast_mode: bool = False) -> list[dict]:
         result = self.execute(pdf_content, xml_filename, False, use_fast_mode, keep_pdf=False)