"""Debug script to see what Docling extracts from a PDF.

Usage:
    poetry run python dev_tests/manual/test_docling_debug.py
"""
| 6 | + |
| 7 | +from docling.document_converter import DocumentConverter # type: ignore[import-not-found] |
| 8 | + |
| 9 | + |
# Historical default input, used when no path is supplied by the caller.
_DEFAULT_PDF_PATH = "/Users/mg/Downloads/407ETR.pdf"


def _unwrap_entry(entry):
    """Return the document item held by one ``iterate_items()`` entry.

    Entries may be ``(item, level)`` tuples or bare items; both are accepted.
    """
    if isinstance(entry, tuple) and entry:
        return entry[0]
    return entry


def _print_provenance(item):
    """Print the provenance (page information) attached to ``item``, if any."""
    if not hasattr(item, "prov"):
        print(" Has prov: False")
        return

    prov = item.prov
    print(" Has prov: True")
    print(f" Prov type: {type(prov)}")
    print(f" Prov value: {prov}")

    # Provenance is usually a list; inspect its first element in detail.
    if isinstance(prov, list) and prov:
        first = prov[0]
        print(f" Prov[0] type: {type(first)}")
        print(f" Prov[0] value: {first}")
        # Probe both spellings so the page number shows up whichever one
        # the installed Docling version uses.
        if hasattr(first, "page_no"):
            print(f" Prov[0].page_no: {first.page_no}")
        if hasattr(first, "page"):
            print(f" Prov[0].page: {first.page}")
        if hasattr(first, "__dict__"):
            print(f" Prov[0] attrs: {first.__dict__}")


def _print_item_details(position, entry):
    """Print type, text preview and page information for one entry.

    Args:
        position: 1-based position of the entry, used in the heading.
        entry: A value yielded by ``iterate_items()``.
    """
    print(f"\n --- Item {position} ---")

    if isinstance(entry, tuple) and entry:
        level = entry[1] if len(entry) > 1 else None
        print(f" Tuple: (item, level={level})")
    else:
        print(" Direct item")
    item = _unwrap_entry(entry)

    print(f" Type: {type(item).__name__}")

    if hasattr(item, "text"):
        # str() keeps the preview safe when ``text`` is None or not a string.
        text = str(item.text)[:80]
        print(f" Text: {text}...")

    _print_provenance(item)

    # Page attributes may also live directly on the item.
    if hasattr(item, "page_no"):
        print(f" item.page_no: {item.page_no}")
    if hasattr(item, "page"):
        print(f" item.page: {item.page}")
    else:
        print(f" Attributes: {dir(item)[:10]}...")  # Show first 10 attrs


def _preview_export(doc, method_name, label):
    """Call ``doc.<method_name>()`` if present and print a 500-char preview."""
    if not hasattr(doc, method_name):
        return

    print(f" - Has {method_name}")
    try:
        rendered = getattr(doc, method_name)()
        print(f" - {label} length: {len(rendered)} chars")
        print(f" - {label} preview:\n{rendered[:500]}")
    except Exception as e:
        # Deliberately broad: this is a best-effort debug dump, and a failing
        # exporter should be reported without aborting the rest of the output.
        print(f" - Export failed: {e}")


def main(pdf_path=None):
    """Debug Docling extraction.

    Converts a PDF with Docling and prints the document metadata, a tally of
    item types, details of the first five items and previews of the markdown
    and text exports.

    Args:
        pdf_path: Path (or URL) of the document to convert. When ``None`` the
            script's historical default path is used.

    Raises:
        SystemExit: With status 1 when ``pdf_path`` is a local path that does
            not point to an existing file.
    """
    from collections import Counter
    from pathlib import Path

    if pdf_path is None:
        pdf_path = _DEFAULT_PDF_PATH

    print("=" * 80)
    print("DOCLING DEBUG - Raw Extraction")
    print("=" * 80)
    print(f"\n📄 Processing: {pdf_path}\n")

    # Fail early with a clear message instead of a converter traceback.
    # Strings containing a scheme are passed through untouched, since they
    # are not local filesystem paths.
    if "://" not in str(pdf_path) and not Path(pdf_path).is_file():
        print(f"❌ File not found: {pdf_path}")
        raise SystemExit(1)

    # Convert with Docling
    converter = DocumentConverter()
    result = converter.convert(pdf_path)

    doc = result.document

    print("✅ Document converted successfully")
    print("\n📋 Document Metadata:")
    print(f" - Has metadata attr: {hasattr(doc, 'metadata')}")
    if hasattr(doc, "metadata"):
        print(f" - Metadata: {doc.metadata}")

    print("\n🔍 Document Structure:")
    print(f" - Has iterate_items: {hasattr(doc, 'iterate_items')}")

    if hasattr(doc, "iterate_items"):
        entries = list(doc.iterate_items())
        print(f" - Total items: {len(entries)}")

        if entries:
            print("\n📝 Item Types:")
            # Unwrap (item, level) tuples first; otherwise every entry would
            # be tallied as "tuple".
            type_counts = Counter(
                type(_unwrap_entry(entry)).__name__ for entry in entries
            )
            for type_name, count in type_counts.items():
                print(f" - {type_name}: {count}")

            print("\n🔎 First 5 items (checking page info):")
            for position, entry in enumerate(entries[:5], start=1):
                _print_item_details(position, entry)
        else:
            print(" ⚠️ No items found!")
            print("\n This could mean:")
            print(" 1. PDF is image-based and needs OCR")
            print(" 2. PDF structure isn't recognized")
            print(" 3. Content is in a different format")

    # Check if we can export to markdown / plain text
    print("\n📄 Export Options:")
    _preview_export(doc, "export_to_markdown", "Markdown")
    _preview_export(doc, "export_to_text", "Text")

    print("\n" + "=" * 80)


if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default PDF path.
    main(sys.argv[1] if len(sys.argv) > 1 else None)
0 commit comments