langchain-ai · gafda · Jul 9, 2025
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -431,42 +431,60 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
         import pypdf
         from PIL import Image
 
-        if "/XObject" not in cast(dict, page["/Resources"]).keys():
+        # A page with no /Resources entry, or whose /Resources has no
+        # /XObject entry, has nothing to extract.
+        if (
+            "/Resources" not in page
+            or "/XObject" not in cast(dict, page["/Resources"]).keys()
+        ):
             return ""
 
         xObject = page["/Resources"]["/XObject"].get_object()
         images = []
         for obj in xObject:
-            np_image: Any = None
-            if xObject[obj]["/Subtype"] == "/Image":
-                img_filter = (
-                    xObject[obj]["/Filter"][1:]
-                    if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject
-                    else xObject[obj]["/Filter"][0][1:]
-                )
-                if img_filter in _PDF_FILTER_WITHOUT_LOSS:
-                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
-
-                    np_image = np.frombuffer(
-                        xObject[obj].get_data(), dtype=np.uint8
-                    ).reshape(height, width, -1)
-                elif img_filter in _PDF_FILTER_WITH_LOSS:
-                    np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
-
-                else:
-                    logger.warning("Unknown PDF Filter!")
-                if np_image is not None:
-                    image_bytes = io.BytesIO()
-
-                    if image_bytes.getbuffer().nbytes == 0:
-                        continue
-
-                    Image.fromarray(np_image).save(image_bytes, format="PNG")
-                    blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
-                    image_text = next(self.images_parser.lazy_parse(blob)).page_content
-                    images.append(
-                        _format_inner_image(blob, image_text, self.images_inner_format)
-                    )
+            # Best-effort extraction: an image that fails to decode or parse
+            # is logged and skipped so it cannot abort the rest of the page.
+            try:
+                image_obj = xObject[obj]
+                if image_obj["/Subtype"] != "/Image":
+                    continue
+
+                pdf_filter = image_obj["/Filter"]
+                # A single NameObject is used directly; anything else is
+                # indexed and its first entry used. The leading "/" is dropped.
+                img_filter = (
+                    pdf_filter[1:]
+                    if type(pdf_filter) is pypdf.generic._base.NameObject
+                    else pdf_filter[0][1:]
+                )
+
+                np_image: Any = None
+                if img_filter in _PDF_FILTER_WITHOUT_LOSS:
+                    height, width = image_obj["/Height"], image_obj["/Width"]
+                    np_image = np.frombuffer(
+                        image_obj.get_data(), dtype=np.uint8
+                    ).reshape(height, width, -1)
+                elif img_filter in _PDF_FILTER_WITH_LOSS:
+                    np_image = np.array(Image.open(io.BytesIO(image_obj.get_data())))
+                else:
+                    logger.warning("Unknown PDF Filter!")
+                if np_image is None:
+                    continue
+
+                # Encode to PNG *before* checking the buffer size; checking an
+                # untouched BytesIO always sees 0 bytes and skips every image.
+                image_bytes = io.BytesIO()
+                Image.fromarray(np_image).save(image_bytes, format="PNG")
+                if image_bytes.getbuffer().nbytes == 0:
+                    continue
+
+                blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
+                image_text = next(self.images_parser.lazy_parse(blob)).page_content
+                images.append(
+                    _format_inner_image(blob, image_text, self.images_inner_format)
+                )
+            except Exception as e:
+                logger.warning("Failed to extract image from PDF: %s", e)
         return _FORMAT_IMAGE_STR.format(
             image_text=_JOIN_IMAGES.join(filter(None, images))
         )

diff --git a/libs/community/pyproject.toml b/libs/community/pyproject.toml
@@ -75,6 +75,13 @@ typing = [
     "langchain-text-splitters",
     "langchain",
 ]
+pdf = [
+    "pdfminer-six>=20250506",
+    "pdfplumber>=0.11.7",
+    "pymupdf>=1.21.0",
+    "pypdf>=5.7.0",
+    "unstructured>=0.18.3",
+]
 
 [tool.ruff]
 target-version = "py39"
@@ -84,9 +91,9 @@ exclude = [
 ]
 
 [tool.mypy]
-ignore_missing_imports = "True"
-disallow_untyped_defs = "True"
-warn_unused_ignores = "True"
+ignore_missing_imports = true
+disallow_untyped_defs = true
+warn_unused_ignores = true
 
 [tool.codespell]
 skip = ".git,*.pdf,*.svg,*.pdf,*.yaml,*.ipynb,poetry.lock,*.min.js,*.css,package-lock.json,example_data,_dist,examples,*.trig"