Skip to content

Commit bcc5bd5

Browse files
committed
Add pdf text position
1 parent 6e2d27d commit bcc5bd5

File tree

5 files changed

+39
-1
lines changed

5 files changed

+39
-1
lines changed

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,4 @@ latex2mathml==3.78.0
2626
PyMuPDF==1.25.5
2727
ollama==0.6.0
2828
cachetools==6.2.1
29-
git+https://github.com/huridocs/pdf-features.git@2025.10.13.3
29+
git+https://github.com/huridocs/pdf-features.git@2025.10.16.3

src/app.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import subprocess
1111
import json
1212

13+
from use_cases.pdf_analysis.get_pdf_word_positions import get_pdf_word_positions
14+
1315
# Fail fast when no CUDA device is visible: raising at import time aborts the
# service, presumably so an external supervisor restarts it on a GPU host
# (the error message says a restart is required) — confirm with deployment setup.
if RESTART_IF_NO_GPU:
    if not torch.cuda.is_available():
        raise RuntimeError("No GPU available. Restarting the service is required.")
@@ -49,6 +51,12 @@ async def analyze_pdf(file: UploadFile = File(...), fast: bool = Form(False), pa
4951
)
5052

5153

54+
@app.post("/word_positions")
@catch_exceptions
async def word_positions(file: UploadFile = File(...)):
    """Return the positions of the words in the uploaded PDF.

    The uploaded bytes are read eagerly and handed to the (blocking)
    extraction use case via Starlette's threadpool, so the event loop
    is not blocked while the document is parsed.
    """
    return await run_in_threadpool(get_pdf_word_positions, file.file.read())
5260
@app.post("/save_xml/{xml_file_name}")
5361
@catch_exceptions
5462
async def analyze_and_save_xml(xml_file_name: str, file: UploadFile = File(...), fast: bool = Form(False)):

src/drivers/web/dependency_injection.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from adapters.infrastructure.html_conversion_service_adapter import HtmlConversionServiceAdapter
1212
from adapters.web.fastapi_controllers import FastAPIControllers
1313
from use_cases.pdf_analysis.analyze_pdf_use_case import AnalyzePDFUseCase
14+
from use_cases.pdf_analysis.get_pdf_word_positions import get_pdf_word_positions
1415
from use_cases.text_extraction.extract_text_use_case import ExtractTextUseCase
1516
from use_cases.toc_extraction.extract_toc_use_case import ExtractTOCUseCase
1617
from use_cases.visualization.create_visualization_use_case import CreateVisualizationUseCase

src/tests/test_end_to_end.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,19 @@ def test_toc(self):
199199
self.assertEqual(response_json[-1]["label"], "C. TITLE LONGER")
200200
self.assertEqual(response_json[-1]["indentation"], 2)
201201

202+
def test_word_positions(self):
    """End-to-end: POST a sample PDF to /word_positions and sanity-check the JSON."""
    with open(f"{ROOT_PATH}/test_pdfs/regular.pdf", "rb") as stream:
        files = {"file": stream}

        response = requests.post(f"{self.service_url}/word_positions", files=files)

        response_json = response.json()
        self.assertEqual(response.status_code, 200)
        # Exact word count is not pinned — only that the document yields a
        # substantial number of words — to stay robust to parser changes.
        self.assertGreater(len(response_json), 50)

        # Each word carries a "page_number"; regular.pdf presumably has
        # exactly two pages — confirm against the fixture if it changes.
        page_numbers = set(word["page_number"] for word in response_json)
        self.assertEqual(len(page_numbers), 2)
202215
def test_toc_fast(self):
203216
with open(f"{ROOT_PATH}/test_pdfs/toc-test.pdf", "rb") as stream:
204217
files = {"file": stream}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import tempfile
2+
import os
3+
from pdf_features.PdfTextPosition import PdfTextPosition
4+
from pdf_features.PdfWord import PdfWord
5+
6+
7+
def get_pdf_word_positions(file_content: bytes) -> list[PdfWord]:
    """Extract every word (with its position) from a PDF given as raw bytes.

    The content is spilled to a temporary ``.pdf`` file because the
    pdf-features parser works on a filesystem path; the file is always
    removed afterwards, even if parsing raises.
    """
    descriptor, pdf_path = tempfile.mkstemp(suffix=".pdf")
    try:
        # fdopen closes the descriptor when the with-block exits, so the
        # file is fully flushed before the parser opens it by path.
        with os.fdopen(descriptor, "wb") as pdf_file:
            pdf_file.write(file_content)
        return PdfTextPosition(pdf_path).get_all_pdf_words()
    finally:
        os.unlink(pdf_path)

0 commit comments

Comments
 (0)