|
1 | 1 | import os |
2 | 2 | import re |
3 | 3 | import subprocess |
| 4 | +import tempfile |
4 | 5 |
|
5 | 6 | from pdfminer.high_level import extract_text |
6 | 7 | from pdfminer.layout import LAParams |
@@ -433,6 +434,159 @@ def normalize_name( |
433 | 434 | return reformat_field(this_field, tools_token=tools_token), 0.5 |
434 | 435 |
|
435 | 436 |
|
def _parse_field_mapping_response(response) -> Dict[str, str]:
    """Extract and validate the ``field_mappings`` dict from an LLM response.

    Accepts either an already-parsed dict or a JSON string.

    Raises:
        ValueError: if the response cannot be parsed or is not shaped as
            ``{"field_mappings": {...}}``.
    """
    if isinstance(response, dict):
        field_mappings = response.get("field_mappings", {})
    elif isinstance(response, str):
        try:
            field_mappings = json.loads(response).get("field_mappings", {})
        except json.JSONDecodeError:
            raise ValueError(f"Failed to parse JSON response: {response}")
    else:
        raise ValueError(f"Unexpected response type: {type(response)}")
    if not isinstance(field_mappings, dict):
        raise ValueError("field_mappings is not a dictionary")
    return field_mappings


def _dedupe_mapped_names(
    original_field_names: List[str], field_mappings: Dict[str, str]
) -> Dict[str, str]:
    """Map each original name to its new name, suffixing ``__2``, ``__3``, ... on collisions.

    Preserves the order of ``original_field_names``; a name missing from
    ``field_mappings`` falls back to itself.
    """
    final_mappings: Dict[str, str] = {}
    used_names: set = set()
    for original_name in original_field_names:
        new_name = field_mappings.get(original_name, original_name)
        if new_name in used_names:
            counter = 2
            base_name = new_name
            while f"{base_name}__{counter}" in used_names:
                counter += 1
            new_name = f"{base_name}__{counter}"
        final_mappings[original_name] = new_name
        used_names.add(new_name)
    return final_mappings


def rename_pdf_fields_with_context(
    pdf_path: str,
    original_field_names: List[str],
    openai_creds: Optional[OpenAiCreds] = None,
    api_key: Optional[str] = None,
    model: str = "gpt-5-nano",
) -> Dict[str, str]:
    """
    Use LLM to rename PDF fields based on full PDF context with field markers.

    Args:
        pdf_path: Path to the PDF file
        original_field_names: List of original field names from the PDF
        openai_creds: OpenAI credentials to use for the API call
        api_key: explicit API key to use (overrides creds and env vars)
        model: the OpenAI model to use (default: gpt-5-nano)

    Returns:
        Dictionary mapping original field names to new Assembly Line names.
        On any LLM failure, falls back to regex-based renaming so callers
        always receive a complete mapping.
    """
    if not original_field_names:
        return {}

    try:
        # Import here to avoid circular imports
        from .pdf_wrangling import get_original_text_with_fields

        # Extract PDF text with field markers via a temp file. Use mkstemp and
        # close the descriptor immediately so the helper (and our re-open) can
        # access the path even on platforms that lock open files (Windows).
        fd, temp_path = tempfile.mkstemp(suffix=".txt")
        os.close(fd)
        try:
            get_original_text_with_fields(pdf_path, temp_path)
            with open(temp_path, "r", encoding="utf-8") as f:
                pdf_text_with_fields = f.read()
        finally:
            # Best-effort cleanup; only swallow filesystem errors.
            try:
                os.unlink(temp_path)
            except OSError:
                pass

        if not pdf_text_with_fields or not pdf_text_with_fields.strip():
            # Fallback: if we can't get text with field markers, use basic approach
            print("Warning: Could not extract PDF text with field markers, falling back to regex approach")
            return {name: regex_norm_field(re_case(name)) for name in original_field_names}

        # Load the field labeling prompt
        system_message = _load_prompt("field_labeling")

        # For GPT-5-nano: Support up to 30 pages (roughly 100K tokens input, well within 400K limit)
        # Estimate: 30 pages * ~1300 tokens/page = ~39K tokens for PDF text
        # Plus prompt and field list = ~50K total input tokens (comfortable margin)
        max_pdf_text_chars = 300000  # Roughly 75K tokens worth of text

        user_message = f"""Here is the PDF form text with field markers:

{pdf_text_with_fields[:max_pdf_text_chars]}

Original field names to rename:
{json.dumps(original_field_names, indent=2)}

Please analyze the context around each field marker and provide appropriate Assembly Line variable names."""

        # Call the LLM with much higher limits for GPT-5-nano
        response = text_complete(
            system_message=system_message,
            user_message=user_message,
            max_tokens=15000,  # Increased for larger field lists and more detailed reasoning
            creds=openai_creds,
            api_key=api_key,
            model=model,
        )

        field_mappings = _parse_field_mapping_response(response)

        # Ensure every original field is mapped (fallback for any the LLM skipped)
        for missing_field in set(original_field_names) - set(field_mappings):
            fallback_name = regex_norm_field(re_case(missing_field))
            field_mappings[missing_field] = fallback_name
            print(f"Warning: LLM didn't map '{missing_field}', using fallback: '{fallback_name}'")

        # Remove extra fields that weren't in the original list
        for extra_field in set(field_mappings) - set(original_field_names):
            del field_mappings[extra_field]
            print(f"Warning: LLM provided mapping for unknown field '{extra_field}', removing")

        # Resolve duplicate target names with __2, __3, ... suffixes
        return _dedupe_mapped_names(original_field_names, field_mappings)

    except Exception as ex:
        print(f"Failed to rename fields with LLM: {ex}")

        # Fallback: regex-based renaming, deduplicated the same way
        return _dedupe_mapped_names(
            original_field_names,
            {name: regex_norm_field(re_case(name)) for name in original_field_names},
        )
| 588 | + |
| 589 | + |
436 | 590 | # Take a list of AL variables and spits out suggested groupings. Here's what's going on: |
437 | 591 | # |
438 | 592 | # 1. It reads in a list of fields (e.g., `["user_name","user_address"]`) |
@@ -964,8 +1118,9 @@ def text_complete( |
964 | 1118 |
|
965 | 1119 | # GPT-5 models use max_completion_tokens instead of max_tokens and need more tokens due to reasoning |
966 | 1120 | if model.startswith("gpt-5"): |
967 | | - # Increase tokens significantly for GPT-5 models to account for reasoning tokens |
968 | | - completion_params["max_completion_tokens"] = max_tokens * 10 |
| 1121 | + # Increase tokens for GPT-5 models but respect the 128K completion token limit |
| 1122 | + requested_tokens = min(max_tokens * 5, 128000) # 5x multiplier but capped at 128K |
| 1123 | + completion_params["max_completion_tokens"] = requested_tokens |
969 | 1124 | else: |
970 | 1125 | completion_params["max_tokens"] = max_tokens |
971 | 1126 | completion_params["temperature"] = temperature |
@@ -1011,15 +1166,18 @@ def complete_with_command( |
1011 | 1166 | api_key: Optional[str] = None, |
1012 | 1167 | ) -> str: |
1013 | 1168 | """Combines some text with a command to send to open ai.""" |
1014 | | - # OpenAI's max number of tokens length is 4097, so we trim the input text to 4080 - command - tokens length. |
1015 | | - # A bit less than 4097 in case the tokenizer is wrong |
1016 | | - # don't deal with negative numbers, clip at 1 (OpenAi will error anyway) |
1017 | | - max_length = max(4080 - len(tokenizer(command)["input_ids"]) - tokens, 1) |
| 1169 | + # For GPT-5-nano: 400K input token limit, so we can handle much larger inputs |
| 1170 | + # Support up to 30 pages of PDF text (~300K characters = ~75K tokens) |
| 1171 | + # Reserve space for command and response tokens: 375K - command - response = ~300K for text |
| 1172 | + max_input_tokens = 300000 # Conservative limit for input text |
| 1173 | + max_length = max(max_input_tokens - len(tokenizer(command)["input_ids"]) - tokens, 1) |
| 1174 | + |
1018 | 1175 | text_tokens = tokenizer(text) |
1019 | 1176 | if len(text_tokens["input_ids"]) > max_length: |
1020 | 1177 | text = tokenizer.decode( |
1021 | 1178 | tokenizer(text, truncation=True, max_length=max_length)["input_ids"] |
1022 | 1179 | ) |
| 1180 | + |
1023 | 1181 | result = text_complete( |
1024 | 1182 | system_message=command, |
1025 | 1183 | user_message=text, |
def describe_form(
    text, creds: Optional[OpenAiCreds] = None, api_key: Optional[str] = None
) -> str:
    """Return an LLM-generated description of the given form text."""
    prompt = _load_prompt("describe_form")
    # 3000 response tokens leaves room for more detailed descriptions
    return complete_with_command(text, prompt, 3000, creds=creds, api_key=api_key)
1054 | 1212 |
|
1055 | 1213 |
|
1056 | 1214 | def needs_calculations(text: str) -> bool: |
@@ -1416,27 +1574,67 @@ def parse_form( |
1416 | 1574 | title = "(Untitled)" |
1417 | 1575 | nsmi = spot(title + ". " + text, token=spot_token) if spot_token else [] |
1418 | 1576 | if normalize: |
1419 | | - length = len(field_names) |
1420 | | - last = "null" |
1421 | | - new_names = [] |
1422 | | - new_names_conf = [] |
1423 | | - for i, field_name in enumerate(field_names): |
1424 | | - new_name, new_confidence = normalize_name( |
1425 | | - jur or "", |
1426 | | - cat or "", |
1427 | | - i, |
1428 | | - i / length, |
1429 | | - last, |
1430 | | - field_name, |
1431 | | - tools_token=tools_token, |
1432 | | - ) |
1433 | | - new_names.append(new_name) |
1434 | | - new_names_conf.append(new_confidence) |
1435 | | - last = field_name |
1436 | | - new_names = [ |
1437 | | - v + "__" + str(new_names[:i].count(v) + 1) if new_names.count(v) > 1 else v |
1438 | | - for i, v in enumerate(new_names) |
1439 | | - ] |
| 1577 | + # Use enhanced LLM-powered field renaming with PDF context |
| 1578 | + if (openai_creds or resolved_api_key) and field_names: |
| 1579 | + try: |
| 1580 | + field_mappings = rename_pdf_fields_with_context( |
| 1581 | + in_file, |
| 1582 | + field_names, |
| 1583 | + openai_creds=openai_creds, |
| 1584 | + api_key=resolved_api_key, |
| 1585 | + ) |
| 1586 | + new_names = [field_mappings.get(name, name) or name for name in field_names] |
| 1587 | + # Set high confidence for LLM-generated names |
| 1588 | + new_names_conf = [0.8 if field_mappings.get(name) else 0.1 for name in field_names] |
| 1589 | + llm_renamed_count = len([n for n in new_names if n and not n.startswith('*')]) |
| 1590 | + print(f"Successfully renamed {llm_renamed_count} fields using LLM") |
| 1591 | + except Exception as e: |
| 1592 | + print(f"LLM field renaming failed: {e}, falling back to traditional approach") |
| 1593 | + # Fallback to traditional approach |
| 1594 | + length = len(field_names) |
| 1595 | + last = "null" |
| 1596 | + new_names = [] |
| 1597 | + new_names_conf = [] |
| 1598 | + for i, field_name in enumerate(field_names): |
| 1599 | + new_name, new_confidence = normalize_name( |
| 1600 | + jur or "", |
| 1601 | + cat or "", |
| 1602 | + i, |
| 1603 | + i / length, |
| 1604 | + last, |
| 1605 | + field_name, |
| 1606 | + tools_token=tools_token, |
| 1607 | + ) |
| 1608 | + new_names.append(new_name) |
| 1609 | + new_names_conf.append(new_confidence) |
| 1610 | + last = field_name |
| 1611 | + new_names = [ |
| 1612 | + v + "__" + str(new_names[:i].count(v) + 1) if new_names.count(v) > 1 else v |
| 1613 | + for i, v in enumerate(new_names) |
| 1614 | + ] |
| 1615 | + else: |
| 1616 | + # Traditional approach when no OpenAI credentials available |
| 1617 | + length = len(field_names) |
| 1618 | + last = "null" |
| 1619 | + new_names = [] |
| 1620 | + new_names_conf = [] |
| 1621 | + for i, field_name in enumerate(field_names): |
| 1622 | + new_name, new_confidence = normalize_name( |
| 1623 | + jur or "", |
| 1624 | + cat or "", |
| 1625 | + i, |
| 1626 | + i / length, |
| 1627 | + last, |
| 1628 | + field_name, |
| 1629 | + tools_token=tools_token, |
| 1630 | + ) |
| 1631 | + new_names.append(new_name) |
| 1632 | + new_names_conf.append(new_confidence) |
| 1633 | + last = field_name |
| 1634 | + new_names = [ |
| 1635 | + v + "__" + str(new_names[:i].count(v) + 1) if new_names.count(v) > 1 else v |
| 1636 | + for i, v in enumerate(new_names) |
| 1637 | + ] |
1440 | 1638 | else: |
1441 | 1639 | new_names = field_names |
1442 | 1640 | new_names_conf = [] |
|
0 commit comments