From cb8a9d7754bd903eba074a00c24990523075e61f Mon Sep 17 00:00:00 2001 From: Bryce Willey Date: Fri, 27 Oct 2023 14:55:34 -0400 Subject: [PATCH 1/3] Trying to get context around PDF fields Harder than anticipated, for a few reasons: * can't get things in order. I guess that's the point of PDFminer, but... * PDFminer doesn't give you AcroForms at all. It has a completely hardcoded way of getting them, outside the context of the page. * We can do these two things: * kinda put all of the fields back in the original text (see replace_in_original). Doesn't work too well though: lots of duplicate pieces of text put many of the fields in the same place, when they should be in different places. * could gather fields with the same adjacent text, and get all parts of that text in the PDF. Not guaranteed to be in order though. * for each field, get all of the surrounding context. Is okay! But consistently gets too much text for GPT-4. Even if we make it smaller, sometimes the surrounding context isn't the full sentence, or gets too much from other fields (too much shared text, which confuses the two fields). TBH next goal is to try the PDFPageAndFieldInterpreter approach, notes in there.
--- formfyxer/pdf_wrangling.py | 178 +++++++++++++++++++++++++++++++------ 1 file changed, 151 insertions(+), 27 deletions(-) diff --git a/formfyxer/pdf_wrangling.py b/formfyxer/pdf_wrangling.py index 8ce7f32..6616938 100644 --- a/formfyxer/pdf_wrangling.py +++ b/formfyxer/pdf_wrangling.py @@ -6,6 +6,7 @@ from copy import copy from typing import ( Any, + Callable, Dict, Iterable, Optional, @@ -30,8 +31,22 @@ from reportlab.pdfgen import canvas from reportlab.lib.colors import magenta, pink, blue -from pdfminer.converter import PDFLayoutAnalyzer -from pdfminer.layout import LAParams, LTPage, LTTextBoxHorizontal, LTChar, LTContainer +from pdfminer.converter import PDFLayoutAnalyzer, TextConverter +from pdfminer.layout import ( + LAParams, + LTPage, + LTTextBoxHorizontal, + LTChar, + LTContainer, + LTAnno, + LTText, + LTTextBox, + LTTextBoxVertical, + LTTextGroup, + LTTextLine, + LTImage, + LTItem, +) from pdfminer.pdffont import PDFUnicodeNotDefined from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter @@ -689,6 +704,33 @@ def get_result(self) -> List[LTPage]: return self.results +class PDFPageAndFieldInterpreter(PDFPageInterpreter): + # TODO: keep track of all of the fields per page, insert them when rendering the page + pass + + +class TextAndFieldConverter(TextConverter): + def receive_layout(self, ltpage: LTPage) -> None: + def render(item: LTItem) -> None: + if isinstance(item, LTContainer): + for child in item: + render(child) + elif isinstance(item, LTText): + self.write_text(item.get_text()) + if isinstance(item, LTTextBox): + self.write_text("\n") + elif isinstance(item, LTImage): + if self.imagewriter is not None: + self.imagewriter.export_image(item) + elif isinstance(item, LTAnnot): + self.write_text(item.get_text()) + + if self.showpageno: + self.write_text("Page %s\n" % ltpage.pageid) + render(ltpage) + self.write_text("\f") + + class Textbox(TypedDict): textbox: LTTextBoxHorizontal bbox: 
BoundingBoxF @@ -1039,11 +1081,115 @@ def get_possible_fields( return fields +class ImproveNameVisitor: + def __init__(self): + self.used_field_names = set() + + def improve_name_with_surrounding_text( + self, field_info: FormField, textboxes: List[Textbox] + ) -> FormField: + dists = [ + ( + bbox_distance(field_info.get_bbox(), textbox["bbox"])[0], + textbox["textbox"], + textbox["bbox"], + ) + for textbox in textboxes + ] + if DEBUG: + print(f"For {field_info.name}, dists: {dists}") + min_textbox = min(dists, key=lambda d: d[0]) + # TODO(brycew): remove the text boxes if they intersect something, unlikely they are the label for more than one. + # text_obj_bboxes.remove(min_obj[2]) + # TODO(brycew): actual regex replacement of lots of underscores + label = re.sub("[\W]", "_", min_textbox[1].get_text().lower().strip(" \n\t_,.")) + label = re.sub("_{3,}", "_", label).strip("_") + if label not in self.used_field_names: + field_info.name = label + self.used_field_names.add(label) + elif DEBUG: + print(f"avoiding using label {label} more than once") + return field_info + + +class AllCloseTextVisitor: + def __init__(self): + self.field_map = {} + + def all_close_text(self, field_info, textboxes) -> FormField: + dists = [ + (tb["bbox"][0] + tb["bbox"][1] * 1000, tb["textbox"].get_text()) + for tb in textboxes + ] + [ + ( + field_info.get_bbox()[0] + field_info.get_bbox()[1] * 1000, + "{{ " + field_info.name + "}} ", + ) + ] + textbox_order = sorted(dists, key=lambda d: d[0]) + all_text = "".join([tb[1] for tb in textbox_order]) + self.field_map[field_info.name] = all_text + return field_info + + +class LowestVertVisitor: + """Gets just the closest text to the field, and returns that""" + + def __init__(self): + self.field_map = {} + + def lowest_vert(fi, tbs): + dists = [] + for tb in tbs: + dist = pdf_wrangling.bbox_distance(fi.get_bbox(), tb["bbox"]) + a_side, b_side = dist[1], dist[2] + closest_side_dist = min( + pdf_wrangling.get_dist(a_side[0], b_side[0]), + 
pdf_wrangling.get_dist(a_side[1], b_side[1]), + ) + enumm = ("After" if closest_side_dist > 0 else "Before",) + tup = (dist[0], enumm, tb["textbox"], tb["bbox"]) + dists.append(tup) + min_tb = min(dists, key=lambda d: d[0]) + print(f"{fi.name}, {min_tb[2].get_text()}") + self.field_map[fi.name] = min_tb + return fi + + +def replace_in_original(original_text, field_map): + """Given the original text of a PDF (extract_text(...)), adds the field's names in their best places. + Doesn't always work, especially with duplicate text. + """ + text = original_text + for field_info in field_map.items(): + try: + idx = text.index(field_info[1][2].get_text()) + print(f"{field_info[0]}, {idx}") + if field_info[1][1] == "Before": + text = text[:idx] + " {{ " + field_info[0] + " }} " + text[idx:] + else: + new_idx = idx + len(field_info[1][2].get_text()) + text = text[:new_idx] + " {{ " + field_info[0] + " }} " + text[new_idx:] + except Exception as ex: + print(f"EXCEPTION on {field_info[0]}: {ex}") + return text + + def improve_names_with_surrounding_text( fields: List[List[FormField]], textboxes: List[List[Textbox]] -): +) -> List[List[FormField]]: + name_visitor = ImproveNameVisitor() + return surrounding_text_traverse( + fields, + textboxes, + lambda fi, tbs: name_visitor.improve_name_with_surrounding_text(fi, tbs), + ) + + +def surrounding_text_traverse( + fields: List[List[FormField]], textboxes: List[List[Textbox]], visitor: Callable +) -> List[List[FormField]]: new_fields = [] - used_field_names = set() for i, (fields_in_page, text_in_page) in enumerate(zip(fields, textboxes)): # Get text boxes with more than one character (not including spaces, _, etc.) 
text_in_page = [ @@ -1071,29 +1217,7 @@ def improve_names_with_surrounding_text( if intersect ] if intersected: - dists = [ - ( - bbox_distance(field_bbox, textbox["bbox"])[0], - textbox["textbox"], - textbox["bbox"], - ) - for textbox in intersected - ] - if DEBUG: - print(f"For {field_info.name}, dists: {dists}") - min_textbox = min(dists, key=lambda d: d[0]) - # TODO(brycew): remove the text boxes if they intersect something, unlikely they are the label for more than one. - # text_obj_bboxes.remove(min_obj[2]) - # TODO(brycew): actual regex replacement of lots of underscores - label = re.sub( - "[\W]", "_", min_textbox[1].get_text().lower().strip(" \n\t_,.") - ) - label = re.sub("_{3,}", "_", label).strip("_") - if label not in used_field_names: - copied_field_info.name = label - used_field_names.add(label) - elif DEBUG: - print(f"avoiding using label {label} more than once") + copied_field_info = visitor(copied_field_info, intersected) page_fields.append(copied_field_info) new_fields.append(page_fields) From db6a496b18f0f49200ec52eedcf7f3399e79941e Mon Sep 17 00:00:00 2001 From: Bryce Willey Date: Tue, 31 Oct 2023 16:27:30 -0400 Subject: [PATCH 2/3] Got PDF fields in the text of the PDF working --- formfyxer/pdf_wrangling.py | 139 +++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/formfyxer/pdf_wrangling.py b/formfyxer/pdf_wrangling.py index 6616938..d9140d9 100644 --- a/formfyxer/pdf_wrangling.py +++ b/formfyxer/pdf_wrangling.py @@ -52,6 +52,10 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdevice import PDFDevice +from pdfminer.pdftypes import resolve1 +from pdfminer.psparser import PSLiteral, PSKeyword +from pdfminer.utils import decode_text, translate_matrix, mult_matrix, MATRIX_IDENTITY # Change this to true to output lots of images to help understand why a kernel didn't work DEBUG = False @@ -704,10 
+708,145 @@ def get_result(self) -> List[LTPage]: return self.results +class JinjaFieldTextConverter(TextConverter): + def render_char( + self, + matrix, + font, + fontsize: float, + scaling: float, + rise: float, + cid: int, + ncs, + graphicstate, + ) -> float: + try: + text = font.to_unichr(cid) + assert isinstance(text, str), str(type(text)) + except PDFUnicodeNotDefined: + text = self.handle_undefined_char(font, cid) + textwidth = font.char_width(cid) + textdisp = font.char_disp(cid) + # Some fonts don't have "{", "}", or "_". Use the right sizes for them, + # otherwise they won't get combined into the correct lines + if textwidth == 0 and cid == 123 or cid == 125: # "{" or "}" + textwidth = font.char_width(116) # about the size of a "t" + if textwidth == 0 and cid == 95: # "_" + textwidth = font.char_width(77) # about the size of a "M" + item = LTChar( + matrix, + font, + fontsize, + scaling, + rise, + text, + textwidth, + textdisp, + ncs, + graphicstate, + ) + self.cur_item.add(item) + return item.adv + + class PDFPageAndFieldInterpreter(PDFPageInterpreter): # TODO: keep track of all of the fields per page, insert them when rendering the page pass + def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice, doc) -> None: + self.rsrcmgr = rsrcmgr + self.device = device + self.doc = doc + self.field_pages = {} + existing_fields = get_existing_pdf_fields(doc) + + for page_fields, page in zip(existing_fields, doc.pages): + objid = page.obj.objgen[0] + self.field_pages[objid] = [] + for field in page_fields: + self.field_pages[objid].append(field) + + def dup(self) -> "PDFPageInterpreter": + return self.__class__(self.rsrcmgr, self.device, self.doc) + + def get_fields_on_page(self, page_id): + return self.field_pages.get(page_id, []) + + def process_page(self, page) -> None: + (x0, y0, x1, y1) = page.mediabox + if page.rotate == 90: + ctm = (0, -1, 1, 0, -y0, x1) + elif page.rotate == 180: + ctm = (-1, 0, 0, -1, x1, y1) + elif page.rotate == 270: + ctm = 
(0, 1, -1, 0, y1, -x0) + else: + ctm = (1, 0, 0, 1, -x0, -y0) + self.device.begin_page(page, ctm) + + self.render_contents(page.resources, page.contents, ctm=ctm) + # Render all of the fields on the page as {{ field_name }} + # print(page.pageid) + for field in self.get_fields_on_page(page.pageid): + self.do_BT() + # set the font, and the font size. Get any font available + font = list(self.fontmap.values())[-1] + for contender_font in self.fontmap.values(): + if contender_font.is_vertical(): + continue + # Make sure that there's widths for A and a + if ( + contender_font.char_width(65) == 0 + or contender_font.char_width(97) == 0 + ): + continue + font = contender_font + self.textstate.fontsize = 8 + x = 0 + y = 0 + needcharspace = False + # Start a specific position on the page (field.x and field.y) + self.do_TD(field.x, field.y) + matrix = mult_matrix(self.textstate.matrix, ctm) + # print(f"{field.get('T')}, {matrix}") + # Manual Tj operation + for char in r"{{" + field.name + r"}}": + for cid in font.decode(char.encode()): + if needcharspace: + x += 0.1 # charspace + # print(x, cid, font.char_width(cid)) + x += self.device.render_char( + translate_matrix(matrix, (x, y)), + font, + self.textstate.fontsize, # fontsize, + 1.0, # scaling, + 0, + cid, + self.ncs, + self.graphicstate.copy(), + ) + if cid == 32 and wordspace: + x += 0 # wordspace + needcharspace = True + self.do_ET() + self.device.end_page(page) + return + + +def get_original_text_with_fields(input_file, output_file): + """Gets the original text of the document, with the names of the fields in jinja format ({{field_name}})""" + with open(input_file, "rb") as fp, open(input_file, "rb") as dup_fp, open( + output_file, "wb" + ) as output_string: + rsrcmgr = PDFResourceManager() + device = JinjaFieldTextConverter( + rsrcmgr, output_string, codec="utf-8", laparams=LAParams(char_margin=10.0) + ) + interpreter = PDFPageAndFieldInterpreter(rsrcmgr, device, Pdf.open(dup_fp)) + for page in PDFPage.get_pages(fp, 
False): + interpreter.process_page(page) + device.close() + class TextAndFieldConverter(TextConverter): def receive_layout(self, ltpage: LTPage) -> None: From 78cdfdeac971c1fbf6abdb9c77170222229cba0d Mon Sep 17 00:00:00 2001 From: Bryce Willey Date: Sun, 16 Jun 2024 22:14:37 -0400 Subject: [PATCH 3/3] Mypy fixes --- formfyxer/pdf_wrangling.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/formfyxer/pdf_wrangling.py b/formfyxer/pdf_wrangling.py index d9140d9..dca26ac 100644 --- a/formfyxer/pdf_wrangling.py +++ b/formfyxer/pdf_wrangling.py @@ -757,7 +757,7 @@ def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice, doc) -> None: self.rsrcmgr = rsrcmgr self.device = device self.doc = doc - self.field_pages = {} + self.field_pages: Dict[Any, List[FormField]] = {} existing_fields = get_existing_pdf_fields(doc) for page_fields, page in zip(existing_fields, doc.pages): @@ -802,20 +802,18 @@ def process_page(self, page) -> None: continue font = contender_font self.textstate.fontsize = 8 - x = 0 - y = 0 + x = 0.0 + y = 0.0 needcharspace = False # Start a specific position on the page (field.x and field.y) self.do_TD(field.x, field.y) matrix = mult_matrix(self.textstate.matrix, ctm) - # print(f"{field.get('T')}, {matrix}") # Manual Tj operation for char in r"{{" + field.name + r"}}": for cid in font.decode(char.encode()): if needcharspace: x += 0.1 # charspace - # print(x, cid, font.char_width(cid)) - x += self.device.render_char( + x += self.device.render_char( # type: ignore translate_matrix(matrix, (x, y)), font, self.textstate.fontsize, # fontsize, @@ -825,8 +823,8 @@ def process_page(self, page) -> None: self.ncs, self.graphicstate.copy(), ) - if cid == 32 and wordspace: - x += 0 # wordspace + # if cid == 32 and wordspace: + # x += 0 # wordspace needcharspace = True self.do_ET() self.device.end_page(page) @@ -861,7 +859,7 @@ def render(item: LTItem) -> None: elif isinstance(item, LTImage): if self.imagewriter is not None: 
self.imagewriter.export_image(item) - elif isinstance(item, LTAnnot): + elif isinstance(item, LTAnno): self.write_text(item.get_text()) if self.showpageno: