Skip to content

Commit 4009f00

Browse files
authored
Merge pull request #191 from freelawproject/doctor-update-recap-extract
Doctor Fix
2 parents 5f30530 + 9774695 commit 4009f00

File tree

1 file changed

+9
-7
lines changed

1 file changed

+9
-7
lines changed

doctor/lib/text_extraction.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import re
2-
from statistics import mean
32

43
import pdfplumber
54
from pdfplumber.ctm import CTM
@@ -38,6 +37,8 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
3837
Using pdfplumber, extract the text of the document that is not
3938
skewed (i.e., a stamp of approval) and extract out text removing blue text
4039
40+
Strip margin refers only to the top and bottom margins here.
41+
4142
:param page: PdfPlumber page
4243
:param strip_margin: a flag to crop out the margin of a document and skewed content
4344
:return: Text from the pdf plumber page
@@ -47,24 +48,24 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
4748
_, _, width, height = page.bbox
4849
pixels_per_inch = width / 8.5
4950
bbox = (
50-
pixels_per_inch * 1, # 1 inch from left edge
51+
0,
5152
pixels_per_inch * 1, # 1 inch down from top
52-
pixels_per_inch
53-
* 7.5, # 7.5 inches from left edge (1 inch from right)
53+
width,  # right edge of the page (no right-margin crop)
5454
pixels_per_inch * 10, # 10 inches from top (1 inch from bottom)
5555
)
56-
doc_text = (
56+
page_text = (
5757
page.crop(bbox)
5858
.filter(is_skewed)
5959
.extract_text(
6060
layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
6161
)
6262
)
6363
else:
64-
doc_text = page.extract_text(
64+
page_text = page.extract_text(
6565
layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
6666
)
67-
return doc_text
67+
page_text = remove_excess_whitespace(page_text)
68+
return page_text
6869

6970

7071
def has_images(page: pdfplumber.pdf.Page) -> bool:
@@ -126,6 +127,7 @@ def adjust_caption_lines(page_text: str) -> str:
126127
row = row.replace(f" {separator}", f"{addition}{separator}")
127128
page.append(row)
128129
return "\n".join(page)
130+
return page_text
129131

130132

131133
def page_needs_ocr(page: pdfplumber.pdf.Page, page_text: str) -> bool:

0 commit comments

Comments
 (0)