1
1
import re
2
- from statistics import mean
3
2
4
3
import pdfplumber
5
4
from pdfplumber .ctm import CTM
@@ -38,6 +37,8 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
38
37
Use pdfplumber to extract the text of the document that is not
39
38
skewed (i.e. a stamp of approval), and extract the text with blue text removed
40
39
40
+ Note: strip_margin crops only the top and bottom margins here; the full page width is kept.
41
+
41
42
:param page: PdfPlumber page
42
43
:param strip_margin: a flag to crop out the margin of a document and skewed content
43
44
:return: Text from the pdf plumber page
@@ -47,24 +48,24 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
47
48
_ , _ , width , height = page .bbox
48
49
pixels_per_inch = width / 8.5
49
50
bbox = (
50
- pixels_per_inch * 1 , # 1 inch from left edge
51
+ 0 ,
51
52
pixels_per_inch * 1 , # 1 inch down from top
52
- pixels_per_inch
53
- * 7.5 , # 7.5 inches from left edge (1 inch from right)
53
+ width , # full page width (right margin is not cropped)
54
54
pixels_per_inch * 10 , # 10 inches from top (1 inch from bottom)
55
55
)
56
- doc_text = (
56
+ page_text = (
57
57
page .crop (bbox )
58
58
.filter (is_skewed )
59
59
.extract_text (
60
60
layout = True , keep_blank_chars = True , y_tolerance = 5 , y_density = 25
61
61
)
62
62
)
63
63
else :
64
- doc_text = page .extract_text (
64
+ page_text = page .extract_text (
65
65
layout = True , keep_blank_chars = True , y_tolerance = 5 , y_density = 25
66
66
)
67
- return doc_text
67
+ page_text = remove_excess_whitespace (page_text )
68
+ return page_text
68
69
69
70
70
71
def has_images (page : pdfplumber .pdf .Page ) -> bool :
@@ -126,6 +127,7 @@ def adjust_caption_lines(page_text: str) -> str:
126
127
row = row .replace (f" { separator } " , f"{ addition } { separator } " )
127
128
page .append (row )
128
129
return "\n " .join (page )
130
+ return page_text
129
131
130
132
131
133
def page_needs_ocr (page : pdfplumber .pdf .Page , page_text : str ) -> bool :
0 commit comments