Skip to content

Commit dc4cce0

Browse files
GPT-based, in-context field labeling
1 parent 62631ad commit dc4cce0

File tree

3 files changed

+279
-31
lines changed

3 files changed

+279
-31
lines changed

formfyxer/docx_wrangling.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -388,9 +388,11 @@ def get_modified_docx_runs(
388388
encoding = tiktoken.encoding_for_model("gpt-4")
389389
token_count = len(encoding.encode(role_description + docx_repr))
390390

391-
if token_count > 128000:
391+
# Updated limits for GPT-5-nano (400K input tokens) while maintaining GPT-4 compatibility
392+
max_tokens = 350000 # Conservative limit allowing for response tokens
393+
if token_count > max_tokens:
392394
raise Exception(
393-
f"Input to OpenAI is too long ({token_count} tokens). Maximum is 128000 tokens."
395+
f"Input to OpenAI is too long ({token_count} tokens). Maximum is {max_tokens} tokens."
394396
)
395397

396398
moderation_response = openai_client.moderations.create(
@@ -407,7 +409,7 @@ def get_modified_docx_runs(
407409
],
408410
response_format={"type": "json_object"},
409411
temperature=temperature,
410-
max_tokens=4096,
412+
max_tokens=16384, # Increased for more detailed responses
411413
top_p=1,
412414
frequency_penalty=0,
413415
presence_penalty=0,

formfyxer/lit_explorer.py

Lines changed: 226 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import re
33
import subprocess
4+
import tempfile
45

56
from pdfminer.high_level import extract_text
67
from pdfminer.layout import LAParams
@@ -433,6 +434,159 @@ def normalize_name(
433434
return reformat_field(this_field, tools_token=tools_token), 0.5
434435

435436

437+
def rename_pdf_fields_with_context(
438+
pdf_path: str,
439+
original_field_names: List[str],
440+
openai_creds: Optional[OpenAiCreds] = None,
441+
api_key: Optional[str] = None,
442+
model: str = "gpt-5-nano",
443+
) -> Dict[str, str]:
444+
"""
445+
Use LLM to rename PDF fields based on full PDF context with field markers.
446+
447+
Args:
448+
pdf_path: Path to the PDF file
449+
original_field_names: List of original field names from the PDF
450+
openai_creds: OpenAI credentials to use for the API call
451+
api_key: explicit API key to use (overrides creds and env vars)
452+
model: the OpenAI model to use (default: gpt-5-nano)
453+
454+
Returns:
455+
Dictionary mapping original field names to new Assembly Line names
456+
"""
457+
if not original_field_names:
458+
return {}
459+
460+
try:
461+
# Import here to avoid circular imports
462+
from .pdf_wrangling import get_original_text_with_fields
463+
464+
# Get PDF text with field markers
465+
with tempfile.NamedTemporaryFile(mode='w+', suffix='.txt', delete=False) as temp_file:
466+
try:
467+
get_original_text_with_fields(pdf_path, temp_file.name)
468+
469+
# Read the text with field markers
470+
with open(temp_file.name, 'r', encoding='utf-8') as f:
471+
pdf_text_with_fields = f.read()
472+
473+
finally:
474+
# Clean up temp file
475+
try:
476+
os.unlink(temp_file.name)
477+
except:
478+
pass
479+
480+
if not pdf_text_with_fields or not pdf_text_with_fields.strip():
481+
# Fallback: if we can't get text with field markers, use basic approach
482+
print("Warning: Could not extract PDF text with field markers, falling back to regex approach")
483+
return {name: regex_norm_field(re_case(name)) for name in original_field_names}
484+
485+
# Load the field labeling prompt
486+
system_message = _load_prompt("field_labeling")
487+
488+
# For GPT-5-nano: Support up to 30 pages (roughly 50K tokens of input, well within the 400K limit)
489+
# Estimate: 30 pages * ~1300 tokens/page = ~39K tokens for PDF text
490+
# Plus prompt and field list = ~50K total input tokens (comfortable margin)
491+
max_pdf_text_chars = 300000 # Roughly 75K tokens worth of text
492+
493+
user_message = f"""Here is the PDF form text with field markers:
494+
495+
{pdf_text_with_fields[:max_pdf_text_chars]}
496+
497+
Original field names to rename:
498+
{json.dumps(original_field_names, indent=2)}
499+
500+
Please analyze the context around each field marker and provide appropriate Assembly Line variable names."""
501+
502+
# Call the LLM with much higher limits for GPT-5-nano
503+
response = text_complete(
504+
system_message=system_message,
505+
user_message=user_message,
506+
max_tokens=15000, # Increased for larger field lists and more detailed reasoning
507+
creds=openai_creds,
508+
api_key=api_key,
509+
model=model,
510+
)
511+
512+
# Parse the response
513+
if isinstance(response, dict):
514+
field_mappings = response.get("field_mappings", {})
515+
elif isinstance(response, str):
516+
try:
517+
parsed_response = json.loads(response)
518+
field_mappings = parsed_response.get("field_mappings", {})
519+
except json.JSONDecodeError:
520+
raise ValueError(f"Failed to parse JSON response: {response}")
521+
else:
522+
raise ValueError(f"Unexpected response type: {type(response)}")
523+
524+
# Validate the response
525+
if not isinstance(field_mappings, dict):
526+
raise ValueError("field_mappings is not a dictionary")
527+
528+
# Ensure all original fields are mapped
529+
mapped_fields = set(field_mappings.keys())
530+
original_fields = set(original_field_names)
531+
532+
missing_fields = original_fields - mapped_fields
533+
extra_fields = mapped_fields - original_fields
534+
535+
# Handle missing fields with fallback
536+
for missing_field in missing_fields:
537+
fallback_name = regex_norm_field(re_case(missing_field))
538+
field_mappings[missing_field] = fallback_name
539+
print(f"Warning: LLM didn't map '{missing_field}', using fallback: '{fallback_name}'")
540+
541+
# Remove extra fields that weren't in the original list
542+
for extra_field in extra_fields:
543+
del field_mappings[extra_field]
544+
print(f"Warning: LLM provided mapping for unknown field '{extra_field}', removing")
545+
546+
# Handle duplicates by adding suffixes
547+
final_mappings = {}
548+
used_names = set()
549+
550+
for original_name in original_field_names:
551+
new_name = field_mappings.get(original_name, original_name)
552+
553+
# If this name is already used, add a suffix
554+
if new_name in used_names:
555+
counter = 2
556+
base_name = new_name
557+
while f"{base_name}__{counter}" in used_names:
558+
counter += 1
559+
new_name = f"{base_name}__{counter}"
560+
561+
final_mappings[original_name] = new_name
562+
used_names.add(new_name)
563+
564+
return final_mappings
565+
566+
except Exception as ex:
567+
print(f"Failed to rename fields with LLM: {ex}")
568+
569+
# Fallback: use regex-based approach
570+
fallback_mappings = {}
571+
used_names = set()
572+
573+
for original_name in original_field_names:
574+
new_name = regex_norm_field(re_case(original_name))
575+
576+
# Handle duplicates
577+
if new_name in used_names:
578+
counter = 2
579+
base_name = new_name
580+
while f"{base_name}__{counter}" in used_names:
581+
counter += 1
582+
new_name = f"{base_name}__{counter}"
583+
584+
fallback_mappings[original_name] = new_name
585+
used_names.add(new_name)
586+
587+
return fallback_mappings
588+
589+
436590
# Take a list of AL variables and spits out suggested groupings. Here's what's going on:
437591
#
438592
# 1. It reads in a list of fields (e.g., `["user_name","user_address"]`)
@@ -964,8 +1118,9 @@ def text_complete(
9641118

9651119
# GPT-5 models use max_completion_tokens instead of max_tokens and need more tokens due to reasoning
9661120
if model.startswith("gpt-5"):
967-
# Increase tokens significantly for GPT-5 models to account for reasoning tokens
968-
completion_params["max_completion_tokens"] = max_tokens * 10
1121+
# Increase tokens for GPT-5 models but respect the 128K completion token limit
1122+
requested_tokens = min(max_tokens * 5, 128000) # 5x multiplier but capped at 128K
1123+
completion_params["max_completion_tokens"] = requested_tokens
9691124
else:
9701125
completion_params["max_tokens"] = max_tokens
9711126
completion_params["temperature"] = temperature
@@ -1011,15 +1166,18 @@ def complete_with_command(
10111166
api_key: Optional[str] = None,
10121167
) -> str:
10131168
"""Combines some text with a command to send to open ai."""
1014-
# OpenAI's max number of tokens length is 4097, so we trim the input text to 4080 - command - tokens length.
1015-
# A bit less than 4097 in case the tokenizer is wrong
1016-
# don't deal with negative numbers, clip at 1 (OpenAi will error anyway)
1017-
max_length = max(4080 - len(tokenizer(command)["input_ids"]) - tokens, 1)
1169+
# For GPT-5-nano: 400K input token limit, so we can handle much larger inputs
1170+
# Allow large text inputs (~300K characters, roughly ~75K tokens)
1171+
# Leave headroom under the 400K input-token limit for the command and the response; ~300K tokens remain for the text itself
1172+
max_input_tokens = 300000 # Conservative limit for input text
1173+
max_length = max(max_input_tokens - len(tokenizer(command)["input_ids"]) - tokens, 1)
1174+
10181175
text_tokens = tokenizer(text)
10191176
if len(text_tokens["input_ids"]) > max_length:
10201177
text = tokenizer.decode(
10211178
tokenizer(text, truncation=True, max_length=max_length)["input_ids"]
10221179
)
1180+
10231181
result = text_complete(
10241182
system_message=command,
10251183
user_message=text,
@@ -1050,7 +1208,7 @@ def describe_form(
10501208
text, creds: Optional[OpenAiCreds] = None, api_key: Optional[str] = None
10511209
) -> str:
10521210
command = _load_prompt("describe_form")
1053-
return complete_with_command(text, command, 1000, creds=creds, api_key=api_key)
1211+
return complete_with_command(text, command, 3000, creds=creds, api_key=api_key) # Increased for more detailed descriptions
10541212

10551213

10561214
def needs_calculations(text: str) -> bool:
@@ -1416,27 +1574,67 @@ def parse_form(
14161574
title = "(Untitled)"
14171575
nsmi = spot(title + ". " + text, token=spot_token) if spot_token else []
14181576
if normalize:
1419-
length = len(field_names)
1420-
last = "null"
1421-
new_names = []
1422-
new_names_conf = []
1423-
for i, field_name in enumerate(field_names):
1424-
new_name, new_confidence = normalize_name(
1425-
jur or "",
1426-
cat or "",
1427-
i,
1428-
i / length,
1429-
last,
1430-
field_name,
1431-
tools_token=tools_token,
1432-
)
1433-
new_names.append(new_name)
1434-
new_names_conf.append(new_confidence)
1435-
last = field_name
1436-
new_names = [
1437-
v + "__" + str(new_names[:i].count(v) + 1) if new_names.count(v) > 1 else v
1438-
for i, v in enumerate(new_names)
1439-
]
1577+
# Use enhanced LLM-powered field renaming with PDF context
1578+
if (openai_creds or resolved_api_key) and field_names:
1579+
try:
1580+
field_mappings = rename_pdf_fields_with_context(
1581+
in_file,
1582+
field_names,
1583+
openai_creds=openai_creds,
1584+
api_key=resolved_api_key,
1585+
)
1586+
new_names = [field_mappings.get(name, name) or name for name in field_names]
1587+
# Set high confidence for LLM-generated names
1588+
new_names_conf = [0.8 if field_mappings.get(name) else 0.1 for name in field_names]
1589+
llm_renamed_count = len([n for n in new_names if n and not n.startswith('*')])
1590+
print(f"Successfully renamed {llm_renamed_count} fields using LLM")
1591+
except Exception as e:
1592+
print(f"LLM field renaming failed: {e}, falling back to traditional approach")
1593+
# Fallback to traditional approach
1594+
length = len(field_names)
1595+
last = "null"
1596+
new_names = []
1597+
new_names_conf = []
1598+
for i, field_name in enumerate(field_names):
1599+
new_name, new_confidence = normalize_name(
1600+
jur or "",
1601+
cat or "",
1602+
i,
1603+
i / length,
1604+
last,
1605+
field_name,
1606+
tools_token=tools_token,
1607+
)
1608+
new_names.append(new_name)
1609+
new_names_conf.append(new_confidence)
1610+
last = field_name
1611+
new_names = [
1612+
v + "__" + str(new_names[:i].count(v) + 1) if new_names.count(v) > 1 else v
1613+
for i, v in enumerate(new_names)
1614+
]
1615+
else:
1616+
# Traditional approach when no OpenAI credentials available
1617+
length = len(field_names)
1618+
last = "null"
1619+
new_names = []
1620+
new_names_conf = []
1621+
for i, field_name in enumerate(field_names):
1622+
new_name, new_confidence = normalize_name(
1623+
jur or "",
1624+
cat or "",
1625+
i,
1626+
i / length,
1627+
last,
1628+
field_name,
1629+
tools_token=tools_token,
1630+
)
1631+
new_names.append(new_name)
1632+
new_names_conf.append(new_confidence)
1633+
last = field_name
1634+
new_names = [
1635+
v + "__" + str(new_names[:i].count(v) + 1) if new_names.count(v) > 1 else v
1636+
for i, v in enumerate(new_names)
1637+
]
14401638
else:
14411639
new_names = field_names
14421640
new_names_conf = []
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
You are an expert in the Document Assembly Line framework for Docassemble, with deep knowledge of PDF-specific variable naming conventions. Your task is to analyze PDF form text with field markers and assign proper field labels following Assembly Line conventions.
2+
3+
You will receive text extracted from a PDF form where each fillable field is marked as {{current_field_name}}. Your job is to rename these fields with appropriate Assembly Line variable names.
4+
5+
Follow these rules carefully:
6+
7+
1. Use snake_case for labels. All lowercase, with words separated by underscores. Labels must start with a letter and contain only letters, digits, and underscores.
8+
2. Use descriptive but concise names. Aim for fewer than 30 characters. Remove filler words like "the" or "a."
9+
3. Use predefined Assembly Line variable names whenever possible. If no predefined name fits, create a custom one that follows the same naming rules.
10+
4. If the same logical field appears more than once, append `__2`, `__3`, etc. to subsequent occurrences (keep the first occurrence without suffix).
11+
5. Add a digit to a person identifier only when the PDF involves multiple parties of that role (e.g., `users1_name`, `users2_name`). If there is only one party, use the plain identifier (e.g., `users_name`).
12+
6. Apply special suffixes when appropriate:
13+
- `_date` for date fields (e.g., `incident_date`)
14+
- `_amount` or `_value` for currency fields (e.g., `rent_amount`, `house_value`)
15+
- `_yes` and `_no` for paired yes/no checkboxes (e.g., `is_minor_yes`, `is_minor_no`)
16+
7. Use the appropriate role identifier for the person or entity the field belongs to. Choose the identifier that best matches the role in the form.
17+
18+
Valid role identifiers (PDF prefix options):
19+
users, other_parties, plaintiffs, defendants, petitioners, respondents, children, spouses, parents, caregivers, attorneys, translators, debt_collectors, creditors, witnesses, guardians_ad_litem, guardians, decedents, interested_parties
20+
21+
Standard Assembly Line field examples:
22+
- users_name, users1_name, defendants1_name, plaintiffs1_name
23+
- users1_name_first, users1_name_last, users1_name_middle
24+
- users1_address_address, users1_address_city, users1_address_state, users1_address_zip
25+
- users1_address_block (for multiline address fields)
26+
- users1_phone_number, users1_email, users1_birthdate
27+
- users1_signature, signature_date
28+
- docket_number, trial_court, trial_court_county, trial_court_division
29+
30+
Analyze the context around each {{field_name}} marker to determine the appropriate label based on:
31+
- The text immediately before and after the field marker
32+
- The overall context and purpose of the form
33+
- The role of the person who would fill out that field
34+
- The type of information being requested
35+
36+
Return a JSON object with the following structure:
37+
{
38+
"field_mappings": {
39+
"original_field_name_1": "new_assembly_line_name_1",
40+
"original_field_name_2": "new_assembly_line_name_2"
41+
},
42+
"reasoning": {
43+
"original_field_name_1": "Brief explanation of why this name was chosen",
44+
"original_field_name_2": "Brief explanation of why this name was chosen"
45+
}
46+
}
47+
48+
Ensure every original field name from the input text is mapped to exactly one new name in the field_mappings object.

0 commit comments

Comments
 (0)