|
25 | 25 | FieldType, |
26 | 26 | unlock_pdf_in_place, |
27 | 27 | is_tagged, |
| 28 | + get_original_text_with_fields, |
28 | 29 | ) |
29 | 30 |
|
30 | 31 | import math |
@@ -112,7 +113,7 @@ def _truncate_to_token_limit( |
112 | 113 | return text |
113 | 114 | return encoding.decode(tokens[:max_tokens]) |
114 | 115 |
|
115 | | -stop_words = { |
| 116 | +STOP_WORDS = { |
116 | 117 | 'a','about','above','after','again','against','all','am','an','and','any','are','aren','as','at', |
117 | 118 | 'be','because','been','before','being','below','between','both','but','by', |
118 | 119 | 'could','did','do','does','doing','down','during', |
@@ -311,24 +312,7 @@ def reformat_field(text: str, max_length: int = 30, tools_token: Optional[str] = |
311 | 312 | if word not in deduped_sentence: |
312 | 313 | deduped_sentence.append(word) |
313 | 314 | # Drop stop words using the shared module-level STOP_WORDS set (exported from passive voice detection) |
314 | | - local_stop_words = { |
315 | | - 'a','about','above','after','again','against','all','am','an','and','any','are','aren','as','at', |
316 | | - 'be','because','been','before','being','below','between','both','but','by', |
317 | | - 'could','did','do','does','doing','down','during', |
318 | | - 'each','few','for','from','further', |
319 | | - 'had','has','have','having','he','her','here','hers','herself','him','himself','his','how', |
320 | | - 'i','if','in','into','is','it','its','itself', |
321 | | - 'just', |
322 | | - 'me','more','most','my','myself', |
323 | | - 'no','nor','not', |
324 | | - 'of','off','on','once','only','or','other','our','ours','ourselves','out','over','own', |
325 | | - 'same','she','should','so','some','such', |
326 | | - 'than','that','the','their','theirs','them','themselves','then','there','these','they','this','those','through','to','too', |
327 | | - 'under','until','up','very', |
328 | | - 'was','we','were','what','when','where','which','while','who','whom','why','will','with','you','your','yours','yourself','yourselves' |
329 | | - } |
330 | | - |
331 | | - filtered_sentence = [w for w in deduped_sentence if w.lower() not in local_stop_words] |
| 315 | + filtered_sentence = [w for w in deduped_sentence if w.lower() not in STOP_WORDS] |
332 | 316 | candidate_words = filtered_sentence or deduped_sentence |
333 | 317 |
|
334 | 318 | sanitized_words: List[str] = [] |
@@ -569,9 +553,6 @@ def rename_pdf_fields_with_context( |
569 | 553 | return {} |
570 | 554 |
|
571 | 555 | try: |
572 | | - # Import here to avoid circular imports |
573 | | - from .pdf_wrangling import get_original_text_with_fields |
574 | | - |
575 | 556 | # Get PDF text with field markers |
576 | 557 | with tempfile.NamedTemporaryFile(mode='w+', suffix='.txt', delete=False) as temp_file: |
577 | 558 | try: |
|
0 commit comments