Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,15 @@ To find available plugins, search GitHub for the hashtag `#markitdown-plugin`. T

### Azure Document Intelligence

To use Microsoft Document Intelligence for conversion:
To use Microsoft Document Intelligence for conversion from the command line:

```bash
export AZURE_API_KEY="<document_intelligence_api_key>"
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
```

If `AZURE_API_KEY` is not set, MarkItDown will fall back to `DefaultAzureCredential`.

More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).

### Python API
Expand All @@ -154,16 +157,25 @@ result = md.convert("test.xlsx")
print(result.text_content)
```

Document Intelligence conversion in Python:
Document Intelligence conversion in Python using an API key:

```python
import os

from azure.core.credentials import AzureKeyCredential
from markitdown import MarkItDown

md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
md = MarkItDown(
docintel_endpoint=os.environ["AZURE_DOC_INT_ENDPOINT"],
docintel_credential=AzureKeyCredential(os.environ["AZURE_API_KEY"]),
docintel_file_types=["pdf"],
)
result = md.convert("test.pdf")
print(result.text_content)
```

If you omit `docintel_credential`, MarkItDown will use `AZURE_API_KEY` when it is set and otherwise fall back to `DefaultAzureCredential`.

To use Large Language Models for image descriptions (currently only for pptx and image files), provide `llm_client` and `llm_model`:

```python
Expand Down
1 change: 1 addition & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@ def _convert(

# Check if the converter will accept the file, and if so, try to convert it
_accepts = False
res: Optional[DocumentConverterResult] = None
try:
_accepts = converter.accepts(file_stream, stream_info, **_kwargs)
except NotImplementedError:
Expand Down
23 changes: 23 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,29 @@ def test_exceptions() -> None:
assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"


def test_converter_failure_does_not_raise_unboundlocalerror() -> None:
    """Check that a converter failing inside ``convert`` is reported cleanly.

    A stub converter accepts ``.pdf`` streams and then raises from
    ``convert``. The call to ``_convert`` is expected to raise
    ``FileConversionException`` carrying exactly one recorded attempt, and
    that attempt must reference the stub converter.
    """

    class BrokenPdfConverter:
        # Stub: claims any stream whose extension is ".pdf" ...
        def accepts(self, file_stream, stream_info, **kwargs):
            return stream_info.extension == ".pdf"

        # ... and then always fails when asked to convert it.
        def convert(self, file_stream, stream_info, **kwargs):
            raise RuntimeError("broken converter")

    # Only the stub is registered: built-ins and plugins are switched off.
    md = MarkItDown(enable_builtins=False, enable_plugins=False)
    md.register_converter(BrokenPdfConverter())

    pdf_stream = io.BytesIO(b"%PDF-broken")
    pdf_guess = StreamInfo(extension=".pdf", mimetype="application/pdf")

    with pytest.raises(FileConversionException) as exc_info:
        md._convert(file_stream=pdf_stream, stream_info_guesses=[pdf_guess])

    recorded_attempts = exc_info.value.attempts
    assert len(recorded_attempts) == 1
    failed_converter = recorded_attempts[0].converter
    assert type(failed_converter).__name__ == "BrokenPdfConverter"


@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
Expand Down