Support text, JSON, XML and YAML DocumentUrl and BinaryContent on OpenAI (#2851)

pulphix · DouweM · web-flow · commit 5287abfe2a0b · 2025-09-30T15:47:40.000-06:00
Co-authored-by: Douwe Maan <douwe@pydantic.dev>
diff --git a/pydantic_ai_slim/pydantic_ai/messages.py b/pydantic_ai_slim/pydantic_ai/messages.py
@@ -114,6 +114,20 @@ class FileUrl(ABC):
 
     _: KW_ONLY
 
+    identifier: str
+    """The identifier of the file, such as a unique ID. One is generated from the URL if not explicitly set.
+
+    This identifier can be provided to the model in a message to allow it to refer to this file in a tool call argument,
+    and the tool can look up the file in question by iterating over the message history and finding the matching `FileUrl`.
+
+    This identifier is only automatically passed to the model when the `FileUrl` is returned by a tool.
+    If you're passing the `FileUrl` as a user message, it's up to you to include a separate text part with the identifier,
+    e.g. "This is file <identifier>:" preceding the `FileUrl`.
+
+    It's also included in inline-text delimiters for providers that require inlining text documents, so the model can
+    distinguish multiple files.
+    """
+
     force_download: bool = False
     """If the model supports it:
 
@@ -133,17 +147,6 @@ class FileUrl(ABC):
         compare=False, default=None
     )
 
-    identifier: str | None = None
-    """The identifier of the file, such as a unique ID. generating one from the url if not explicitly set
-
-    This identifier can be provided to the model in a message to allow it to refer to this file in a tool call argument,
-    and the tool can look up the file in question by iterating over the message history and finding the matching `FileUrl`.
-
-    This identifier is only automatically passed to the model when the `FileUrl` is returned by a tool.
-    If you're passing the `FileUrl` as a user message, it's up to you to include a separate text part with the identifier,
-    e.g. "This is file <identifier>:" preceding the `FileUrl`.
-    """
-
     def __init__(
         self,
         url: str,
diff --git a/pydantic_ai_slim/pydantic_ai/models/openai.py b/pydantic_ai_slim/pydantic_ai/models/openai.py
@@ -750,8 +750,7 @@ async def _map_user_message(self, message: ModelRequest) -> AsyncIterable[chat.C
             else:
                 assert_never(part)
 
-    @staticmethod
-    async def _map_user_prompt(part: UserPromptPart) -> chat.ChatCompletionUserMessageParam:
+    async def _map_user_prompt(self, part: UserPromptPart) -> chat.ChatCompletionUserMessageParam:  # noqa: C901
         content: str | list[ChatCompletionContentPartParam]
         if isinstance(part.content, str):
             content = part.content
@@ -766,28 +765,40 @@ async def _map_user_prompt(part: UserPromptPart) -> chat.ChatCompletionUserMessa
                         image_url['detail'] = metadata.get('detail', 'auto')
                     content.append(ChatCompletionContentPartImageParam(image_url=image_url, type='image_url'))
                 elif isinstance(item, BinaryContent):
-                    base64_encoded = base64.b64encode(item.data).decode('utf-8')
-                    if item.is_image:
-                        image_url: ImageURL = {'url': f'data:{item.media_type};base64,{base64_encoded}'}
-                        if metadata := item.vendor_metadata:
-                            image_url['detail'] = metadata.get('detail', 'auto')
-                        content.append(ChatCompletionContentPartImageParam(image_url=image_url, type='image_url'))
-                    elif item.is_audio:
-                        assert item.format in ('wav', 'mp3')
-                        audio = InputAudio(data=base64_encoded, format=item.format)
-                        content.append(ChatCompletionContentPartInputAudioParam(input_audio=audio, type='input_audio'))
-                    elif item.is_document:
+                    if self._is_text_like_media_type(item.media_type):
+                        # Inline text-like binary content as a text block
                         content.append(
-                            File(
-                                file=FileFile(
-                                    file_data=f'data:{item.media_type};base64,{base64_encoded}',
-                                    filename=f'filename.{item.format}',
-                                ),
-                                type='file',
+                            self._inline_text_file_part(
+                                item.data.decode('utf-8'),
+                                media_type=item.media_type,
+                                identifier=item.identifier,
                             )
                         )
-                    else:  # pragma: no cover
-                        raise RuntimeError(f'Unsupported binary content type: {item.media_type}')
+                    else:
+                        base64_encoded = base64.b64encode(item.data).decode('utf-8')
+                        if item.is_image:
+                            image_url: ImageURL = {'url': f'data:{item.media_type};base64,{base64_encoded}'}
+                            if metadata := item.vendor_metadata:
+                                image_url['detail'] = metadata.get('detail', 'auto')
+                            content.append(ChatCompletionContentPartImageParam(image_url=image_url, type='image_url'))
+                        elif item.is_audio:
+                            assert item.format in ('wav', 'mp3')
+                            audio = InputAudio(data=base64_encoded, format=item.format)
+                            content.append(
+                                ChatCompletionContentPartInputAudioParam(input_audio=audio, type='input_audio')
+                            )
+                        elif item.is_document:
+                            content.append(
+                                File(
+                                    file=FileFile(
+                                        file_data=f'data:{item.media_type};base64,{base64_encoded}',
+                                        filename=f'filename.{item.format}',
+                                    ),
+                                    type='file',
+                                )
+                            )
+                        else:  # pragma: no cover
+                            raise RuntimeError(f'Unsupported binary content type: {item.media_type}')
                 elif isinstance(item, AudioUrl):
                     downloaded_item = await download_item(item, data_format='base64', type_format='extension')
                     assert downloaded_item['data_type'] in (
@@ -797,20 +808,54 @@ async def _map_user_prompt(part: UserPromptPart) -> chat.ChatCompletionUserMessa
                     audio = InputAudio(data=downloaded_item['data'], format=downloaded_item['data_type'])
                     content.append(ChatCompletionContentPartInputAudioParam(input_audio=audio, type='input_audio'))
                 elif isinstance(item, DocumentUrl):
-                    downloaded_item = await download_item(item, data_format='base64_uri', type_format='extension')
-                    file = File(
-                        file=FileFile(
-                            file_data=downloaded_item['data'], filename=f'filename.{downloaded_item["data_type"]}'
-                        ),
-                        type='file',
-                    )
-                    content.append(file)
+                    if self._is_text_like_media_type(item.media_type):
+                        downloaded_text = await download_item(item, data_format='text')
+                        content.append(
+                            self._inline_text_file_part(
+                                downloaded_text['data'],
+                                media_type=item.media_type,
+                                identifier=item.identifier,
+                            )
+                        )
+                    else:
+                        downloaded_item = await download_item(item, data_format='base64_uri', type_format='extension')
+                        content.append(
+                            File(
+                                file=FileFile(
+                                    file_data=downloaded_item['data'],
+                                    filename=f'filename.{downloaded_item["data_type"]}',
+                                ),
+                                type='file',
+                            )
+                        )
                 elif isinstance(item, VideoUrl):  # pragma: no cover
                     raise NotImplementedError('VideoUrl is not supported for OpenAI')
                 else:
                     assert_never(item)
         return chat.ChatCompletionUserMessageParam(role='user', content=content)
 
+    @staticmethod
+    def _is_text_like_media_type(media_type: str) -> bool:
+        return (
+            media_type.startswith('text/')
+            or media_type == 'application/json'
+            or media_type.endswith('+json')
+            or media_type == 'application/xml'
+            or media_type.endswith('+xml')
+            or media_type in ('application/x-yaml', 'application/yaml')
+        )
+
+    @staticmethod
+    def _inline_text_file_part(text: str, *, media_type: str, identifier: str) -> ChatCompletionContentPartTextParam:
+        text = '\n'.join(
+            [
+                f'-----BEGIN FILE id="{identifier}" type="{media_type}"-----',
+                text,
+                f'-----END FILE id="{identifier}"-----',
+            ]
+        )
+        return ChatCompletionContentPartTextParam(text=text, type='text')
+
 
 @deprecated(
     '`OpenAIModel` was renamed to `OpenAIChatModel` to clearly distinguish it from `OpenAIResponsesModel` which '
diff --git a/tests/assets/dummy.txt b/tests/assets/dummy.txt
@@ -0,0 +1 @@
+Dummy TXT file
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -338,6 +338,13 @@ def document_content(assets_path: Path) -> BinaryContent:
     return BinaryContent(data=pdf_bytes, media_type='application/pdf')
 
 
+@pytest.fixture(scope='session')
+def text_document_content(assets_path: Path) -> BinaryContent:
+    content = assets_path.joinpath('dummy.txt').read_text()
+    bin_content = BinaryContent(data=content.encode(), media_type='text/plain')
+    return bin_content
+
+
 @pytest.fixture(scope='session')
 def deepseek_api_key() -> str:
     return os.getenv('DEEPSEEK_API_KEY', 'mock-api-key')
diff --git a/tests/models/cassettes/test_openai/test_text_document_as_binary_content_input.yaml b/tests/models/cassettes/test_openai/test_text_document_as_binary_content_input.yaml
@@ -0,0 +1,88 @@
+interactions:
+- request:
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate
+      connection:
+      - keep-alive
+      content-length:
+      - '274'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+    method: POST
+    parsed_body:
+      messages:
+      - content:
+        - text: What is the main content on this document?
+          type: text
+        - text: |-
+            -----BEGIN FILE id="f58746" type="text/plain"-----
+            Dummy TXT file
+
+            -----END FILE id="f58746"-----
+          type: text
+        role: user
+      model: gpt-4o
+      stream: false
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    headers:
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      connection:
+      - keep-alive
+      content-length:
+      - '935'
+      content-type:
+      - application/json
+      openai-organization:
+      - pydantic-28gund
+      openai-processing-ms:
+      - '656'
+      openai-project:
+      - proj_dKobscVY9YJxeEaDJen54e3d
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      transfer-encoding:
+      - chunked
+    parsed_body:
+      choices:
+      - finish_reason: stop
+        index: 0
+        logprobs: null
+        message:
+          annotations: []
+          content: The main content of the document is simply the text "Dummy TXT file." It does not appear to contain any
+            other detailed information.
+          refusal: null
+          role: assistant
+      created: 1759266774
+      id: chatcmpl-CLbwUAQGI25CvtPRgqWwhS6M2HXA3
+      model: gpt-4o-2024-08-06
+      object: chat.completion
+      service_tier: default
+      system_fingerprint: fp_f33640a400
+      usage:
+        completion_tokens: 26
+        completion_tokens_details:
+          accepted_prediction_tokens: 0
+          audio_tokens: 0
+          reasoning_tokens: 0
+          rejected_prediction_tokens: 0
+        prompt_tokens: 45
+        prompt_tokens_details:
+          audio_tokens: 0
+          cached_tokens: 0
+        total_tokens: 71
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/models/cassettes/test_openai/test_text_document_url_input.yaml b/tests/models/cassettes/test_openai/test_text_document_url_input.yaml
diff --git a/tests/models/test_openai.py b/tests/models/test_openai.py