Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)

parser.add_argument(
"--export-data-uris",
metavar="PREFIX",
help="Export data URIs, saving them with the selected prefix, which can contain a path. Saved files will have the path '{PREFIX}image{N}.{ext}' where 'N' is a sequential number and 'ext' is the file extension matching the image MIME type.",
)

parser.add_argument("filename", nargs="?")
args = parser.parse_args()

Expand Down Expand Up @@ -186,15 +192,25 @@ def main():
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)

if args.export_data_uris:
if args.keep_data_uris:
_exit_with_error(
"--export-data-uris and --keep-data-uris are mutually exclusive."
)

if args.filename is None:
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
export_data_uris=args.export_data_uris,
)
else:
result = markitdown.convert(
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
args.filename,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
export_data_uris=args.export_data_uris,
)

_handle_output(args, result)
Expand Down
24 changes: 21 additions & 3 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import base64
from pathlib import Path
import re
import markdownify

Expand All @@ -18,7 +20,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any):
    """Fill in converter defaults, then initialise the base converter.

    Values supplied by the caller for ``heading_style``, ``keep_data_uris``
    and ``export_data_uris`` are kept as-is; any of the three that is
    missing receives the default listed below.
    """
    defaults = (
        ("heading_style", markdownify.ATX),
        ("keep_data_uris", False),
        ("export_data_uris", False),
    )
    for key, fallback in defaults:
        # Only fills the gap when the caller did not pass the option.
        options.setdefault(key, fallback)
    # Running counter used to number exported data-URI image files.
    self.save_uri_index = 0
    super().__init__(**options)

def convert_hn(
Expand Down Expand Up @@ -101,9 +105,23 @@ def convert_img(
):
return alt

# Remove dataURIs
if src.startswith("data:") and not self.options["keep_data_uris"]:
src = src.split(",")[0] + "..."
# Handle dataURIs
if src.startswith("data:"):
if self.options["export_data_uris"]:
if (m := re.match(r"data:image/([^;]+);base64,(.*)", src)) is not None:
self.save_uri_index += 1
src = Path(
self.options["export_data_uris"]
+ f"image{self.save_uri_index}.{m.group(1)}"
)
if not src.parent.exists():
src.parent.mkdir(parents=True)
with open(src, "wb") as fh:
fh.write(base64.b64decode(m.group(2)))
else:
src = src.split(",")[0] + "..."
elif not self.options["keep_data_uris"]:
src = src.split(",")[0] + "..."

return "![%s](%s%s)" % (alt, src, title_part)

Expand Down