Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -4525,23 +4525,30 @@ def save_as_html(

with open(filename, "w", encoding="utf-8") as fw:
fw.write(html_out)

def _get_output_paths(
self, filename: Union[str, Path], artifacts_dir: Optional[Path] = None
) -> Tuple[Path, Optional[Path]]:
"""
Determines the output directory for artifacts and the reference path for URIs.
"""
if isinstance(filename, str):
filename = Path(filename)
if artifacts_dir is None:
# Remove the extension and add '_pictures'
artifacts_dir = filename.with_suffix("")
artifacts_dir = artifacts_dir.with_name(artifacts_dir.name + "_artifacts")
if artifacts_dir.is_absolute():
# Default case: create an '_artifacts' directory alongside the file.
final_artifacts_dir = filename.with_name(filename.stem + "_artifacts")
else:
if isinstance(artifacts_dir, str):
artifacts_dir = Path(artifacts_dir)
if artifacts_dir.is_absolute():
final_artifacts_dir = artifacts_dir
else:
final_artifacts_dir = filename.parent / artifacts_dir
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure about this line. Depending on where this function is called from, the use case can differ:

  • for example, if it is called from save_as_json() with the image-referenced flag, one can have a use case where the images are stored completely separately from the JSON document itself, but the path is still relative
  • another example is when both save_as_json() and save_as_markdown() are used with the image-referenced flag: one could use the same artifacts directory to store the images and avoid duplication. To clarify, an example would be filename="../results/json/my_doc.json" and filename="../results/md/my_doc.md" with a shared reference-image directory artifacts_dir="../results/ref_images/my_doc/"

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My proposal would be to not modify artifacts_dir in any way if a value is provided.

if final_artifacts_dir.is_absolute():
reference_path = None
else:
reference_path = filename.parent
artifacts_dir = reference_path / artifacts_dir

return artifacts_dir, reference_path
return final_artifacts_dir, reference_path

def _make_copy_with_refmode(
self,
Expand Down
Loading