Skip to content

Conversation

@milahu
Copy link

@milahu milahu commented Oct 25, 2025

help convert hocr files to pdf

status: abandoned draft

i store my raw scan images with 600 dpi (because why not...)
but to release an ebook, i want to downscale to 300 dpi
and apply some image compression

example use

hocr2pdf.py
#!/usr/bin/env python3

import os
import re
import sys
import tempfile
import shutil
from pathlib import Path
from PyPDF2 import PdfMerger

from ocrmypdf.hocrtransform import HocrTransform

# Input and output locations.
# The hOCR input files are generated with
#   tesseract 001.tiff - -c tessedit_create_hocr=1 --dpi 600 -l deu+eng \
#     --oem 1 --psm 6 --tessdata-dir ../tessdata_best >001.hocr
hocr_dir = Path("090-ocr")

# Both outputs are named after this script and are relative to the
# current working directory.
script_stem = os.path.splitext(os.path.basename(__file__))[0]
temp_dir = Path(f"{script_stem}-temp")
merged_pdf_path = Path(f"{script_stem}.pdf")

# Per-page PDFs are collected here before merging.
temp_dir.mkdir(exist_ok=True)

# FIXME HocrTransform should parse these values from the hocr file
# Patterns for the image path and the scan resolution inside a hOCR file.
IMAGE_RE = re.compile(r'image "([^"]+)"')
DPI_RE = re.compile(r'scan_res (\d+) (\d+)')

# All hOCR files of the input directory, in sorted order.
hocr_files = list(hocr_dir.glob("*.hocr"))
hocr_files.sort()

# Render every hOCR page into its own single-page PDF inside temp_dir.
# Pages whose PDF already exists are kept, so an interrupted run can be
# resumed without redoing finished pages.
for hocr_file in hocr_files:
    text = hocr_file.read_text(encoding="utf-8")

    # Parse image path (relative to HOCR file)
    match = IMAGE_RE.search(text)
    if not match:
        print(f"Could not find image path in {hocr_file}")
        continue
    image_path = (hocr_file.parent / match.group(1)).resolve()

    # Parse DPI if available; only the first of the two scan_res values is used
    dpi_match = DPI_RE.search(text)
    dpi = int(dpi_match.group(1)) if dpi_match else 300  # fallback

    page_pdf_path = temp_dir / f"{hocr_file.stem}.pdf"
    if page_pdf_path.exists():
        print(f"keeping {page_pdf_path}")
        continue

    # Transform to PDF
    try:
        ht = HocrTransform(hocr_filename=hocr_file, dpi=dpi)
        ht.to_pdf(
            out_filename=page_pdf_path,
            image_filename=image_path,
            invisible_text=True,
            scale_factor=0.5,
            pil_image_save_kwargs={
                # 432M    100-hocr2pdf.jp2.q32.pdf
                # 410M    110-tiff2jp2-q32-magick
                # 1006M   110-tiff2jp2-q32-pillow (wtf?)
                "format": "JPEG2000",
                "quality": 32,
                # 554M    100-hocr2pdf.jpg.q50.pdf (wtf?)
                # 133M    110-tiff2jpg-q50
                # "format": "JPEG",
                # "quality": 50,
            }
        )
    except Exception as e:
        print(f"error processing {hocr_file}: {e}")
        # Remove a possibly half-written page: the exists() check above
        # would otherwise "keep" the broken file on the next run.
        page_pdf_path.unlink(missing_ok=True)
        # Abort the whole run; a missing page would silently corrupt the book.
        raise
    else:
        print(f"writing {page_pdf_path}")

# Merge PDFs
# Every PDF found in temp_dir is appended in sorted file-name order.
merger = PdfMerger()
try:
    for page_pdf in sorted(temp_dir.glob("*.pdf")):
        merger.append(page_pdf)

    merger.write(merged_pdf_path)
finally:
    # Close the merger even when a page fails to load or the write fails.
    merger.close()
print(f"writing {merged_pdf_path}")

# Optional cleanup
# import shutil; shutil.rmtree(temp_dir)

... but what i dont understand:
apparently the compressed image_to_draw is not used
by canvas.do.draw_image
in src/ocrmypdf/hocrtransform/_hocr.py

            canvas.do.draw_image(
                image_to_draw,
                0,
                0,
                width=self.width * scale_factor,
                height=self.height * scale_factor,
            )

... so there seems to be another decode-encode step
which stores the images with a different quality
so with format="JPEG" and quality=50
i get a PDF with 554MB instead of 150MB

fixme:
with format="JPEG" the scaling does not work
the content is scaled, but the page is not scaled
and the content is at the bottom-left of the page

fixme:
why is JPEG2000 compression in imagemagick so much better than in pillow

1006M   110-tiff2jp2-pillow
410M    110-tiff2jp2-magick
img2jp2.py
#!/usr/bin/env python3
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
from PIL import Image
import psutil
import subprocess
import shlex

# --- Configuration ---
# Compression quality (0–100); alternatives kept from earlier runs: 50, 20, 10, 5, 2
QUALITY = 32
SCALE_FACTOR = 0.5
INPUT_DIR = "070-deskew"

# Backend switch: True converts with the "magick" command, False with Pillow.
use_imagemagick = True

# FIXME why is JPEG2000 compression in imagemagick so much better than in pillow
# 1006M   110-tiff2jp2-pillow
# 410M    110-tiff2jp2-magick

# The output directory is named after this script; the ImageMagick backend
# writes to a separate "-magick" directory so both results can be compared.
_script_name = os.path.splitext(os.path.basename(__file__))[0]
OUTPUT_DIR = (_script_name + "-magick") if use_imagemagick else _script_name
# OUTPUT_DIR = _script_name + f"-q{QUALITY}" # compare qualities

# Physical core count as reported by psutil, or 1 if psutil reports nothing
MAX_WORKERS = psutil.cpu_count(logical=False) or 1

MAX_WORKERS = 1 # debug

# use imagemagick to compare jpeg2000 qualities around 30%
r'''
src=001.tiff;
for (( q=20; q<=40; q++ )); do
    dst="$src".magick.s50.q$(printf %03d "$q").jp2;
    [ -e "$dst" ] && continue;
    magick "$src" -scale 50% -quality ${q}% "$dst";
done
'''

os.makedirs(OUTPUT_DIR, exist_ok=True)

def compress_tiff_to_jpeg2000(filename):
    """Downscale one TIFF from INPUT_DIR and store it as JPEG 2000 in OUTPUT_DIR.

    The output keeps the input's base name with a ``.jp2`` extension.
    Existing outputs are never overwritten, so the script can be re-run
    incrementally. The image is scaled by SCALE_FACTOR and compressed with
    QUALITY, using either the ``magick`` command or Pillow depending on
    ``use_imagemagick``.

    Args:
        filename: Name of the TIFF file, relative to INPUT_DIR.

    Returns:
        A status line: ``"keeping <path>"`` if the output already exists,
        otherwise ``"writing <path>"``.

    Raises:
        subprocess.CalledProcessError: If the ``magick`` command exits with
            a non-zero status (ImageMagick backend).
        Exception: Any error raised by Pillow while reading, resizing or
            saving is propagated unchanged (Pillow backend).
    """
    input_path = os.path.join(INPUT_DIR, filename)
    output_name = os.path.splitext(filename)[0] + ".jp2"
    output_path = os.path.join(OUTPUT_DIR, output_name)

    if os.path.exists(output_path):
        return f"keeping {output_path}"

    if use_imagemagick:
        # use imagemagick -> better compression?
        args = [
            "magick",
            input_path,
            "-scale", f"{round(SCALE_FACTOR * 100)}%",
            # Pass QUALITY through so that both backends honor the setting.
            "-quality", str(QUALITY),
            output_path,
        ]
        print(">", shlex.join(args))
        # check=True: a failed conversion must not be reported as "writing".
        subprocess.run(args, check=True)
        return f"writing {output_path}"

    with Image.open(input_path) as img:
        # TODO why?
        # if img.mode not in ("RGB", "L"):
        #     img = img.convert("RGB")

        if SCALE_FACTOR != 1.0:
            new_size = (
                int(img.width * SCALE_FACTOR),
                int(img.height * SCALE_FACTOR)
            )
            img = img.resize(new_size, Image.LANCZOS)

        # Pillow's JPEG 2000 writer has no "quality" option; the documented
        # controls are quality_mode and quality_layers. Without them the
        # QUALITY setting does not reach the encoder.
        # NOTE(review): QUALITY is interpreted here as a target PSNR in dB
        # (higher is better) -- confirm this matches the scale that
        # ImageMagick's -quality uses for JP2, or switch to
        # quality_mode="rates" if a compression ratio is wanted.
        # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html#jpeg-2000
        img.save(
            output_path,
            format="JPEG2000",
            quality_mode="dB",
            quality_layers=[QUALITY],
            # Use the lossy wavelet transform, since lossy output is wanted.
            irreversible=True,
        )
    return f"writing {output_path}"

def main():
    """Convert every TIFF found in INPUT_DIR and print one status line per file.

    Files are submitted to a process pool of MAX_WORKERS workers; status
    lines are printed in completion order. An exception raised by a worker
    is re-raised here by ``result()``.
    """
    tiff_files = [
        name
        for name in os.listdir(INPUT_DIR)
        if name.lower().endswith((".tif", ".tiff"))
    ]
    tiff_files.sort()
    if len(tiff_files) == 0:
        print("no TIFF files found.")
        return

    print(f"processing {len(tiff_files)} files using {MAX_WORKERS} workers...")

    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Map each pending future back to the file it was created for.
        pending = {}
        for name in tiff_files:
            pending[pool.submit(compress_tiff_to_jpeg2000, name)] = name
        for finished in as_completed(pending):
            print(finished.result())

if __name__ == "__main__":
    main()

alternative:
hocr-to-epub-fxl - convert hocr files of a scanned book to a fixed-layout epub
this creates about 2x smaller files due to the AVIF image format
which is not supported in PDF files
(EPUB is the future...)

related issues

@codecov
Copy link

codecov bot commented Oct 25, 2025

Codecov Report

❌ Patch coverage is 17.64706% with 14 lines in your changes missing coverage. Please review.
✅ Project coverage is 89.55%. Comparing base (f181307) to head (77a502f).

Files with missing lines Patch % Lines
src/ocrmypdf/hocrtransform/_hocr.py 17.64% 13 Missing and 1 partial ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##             main    #1586      +/-   ##
==========================================
- Coverage   89.72%   89.55%   -0.18%     
==========================================
  Files          96       96              
  Lines        7185     7202      +17     
  Branches      735      739       +4     
==========================================
+ Hits         6447     6450       +3     
- Misses        529      542      +13     
- Partials      209      210       +1     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@jbarlow83
Copy link
Collaborator

The intended workflow in ocrmypdf is to use higher optimization level settings when you have very high DPI images.
You can think of lossy compression as a "smart downsampling", while regular downsampling 600->300 is global, dumb downsampling. If you use lossy compression to get a 75% reduction in file size the result is higher quality for the same byte budget. Usually. So try -O3 and lowering both --jpeg-quality, --png-quality.

The hocrtransform step is just intended to copy the image from the input file at its existing quality level. The optimizer is the place that would make sense to introduce an option for downsampling.

ocrmypdf does not convert images to JPEG2000. Apart from superior handling of >32k pixel images, modern JPEG codecs exceed it in quality, are significantly less complex, significantly faster to render, etc.

@milahu
Copy link
Author

milahu commented Oct 26, 2025

The hocrtransform step is just intended to copy the image from the input file at its existing quality level.

but then why does the file size explode from 133MB to 554MB?
(actually this is a separate issue...)

            pil_image_save_kwargs={
                # 554M    100-hocr2pdf.jpg.q50.pdf (wtf?)
                # 133M    110-tiff2jpg-q50
                "format": "JPEG",
                "quality": 50,
            },

100-hocr2pdf.jpg.q50.pdf was created with hocr2pdf.py based on ocrmypdf
110-tiff2jpg-q50 was created with img2jpg.py based on pillow

img2jpg.py
#!/usr/bin/env python3
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
from PIL import Image
import psutil

# --- Configuration ---
# Compression quality (0–100); alternatives kept from earlier runs: 20, 10, 5, 2
QUALITY = 50
SCALE_FACTOR = 0.5
INPUT_DIR = "070-deskew"

# The output directory is named after this script.
_script_name = os.path.splitext(os.path.basename(__file__))[0]
OUTPUT_DIR = _script_name
# OUTPUT_DIR = _script_name + f"-q{QUALITY}" # compare qualities

# Physical core count as reported by psutil, or 1 if psutil reports nothing
MAX_WORKERS = psutil.cpu_count(logical=False) or 1

# use imagemagick to compare jpeg qualities around 50%
r'''
src=001.tiff;
for (( q=10; q<=80; q+=10 )); do
    dst="$src".magick.s50.q$(printf %03d "$q").jpg;
    [ -e "$dst" ] && continue;
    magick "$src" -scale 50% -quality ${q}% "$dst";
done
'''

os.makedirs(OUTPUT_DIR, exist_ok=True)

def compress_tiff_to_jpeg(filename):
    """Downscale one TIFF from INPUT_DIR and store it as JPEG in OUTPUT_DIR.

    The output keeps the input's base name with a ``.jpg`` extension.
    Existing outputs are never overwritten, so the script can be re-run
    incrementally. Images whose mode is neither ``"RGB"`` nor ``"L"`` are
    converted to RGB before saving.

    Args:
        filename: Name of the TIFF file, relative to INPUT_DIR.

    Returns:
        A status line: ``"keeping <path>"`` if the output already exists,
        otherwise ``"writing <path>"``.

    Raises:
        Exception: Any error raised by Pillow while reading, converting,
            resizing or saving is propagated unchanged.
    """
    input_path = os.path.join(INPUT_DIR, filename)
    output_name = os.path.splitext(filename)[0] + ".jpg"
    output_path = os.path.join(OUTPUT_DIR, output_name)

    if os.path.exists(output_path):
        return f"keeping {output_path}"

    # Errors are deliberately not caught here: they surface in the caller
    # through the future's result().
    with Image.open(input_path) as img:
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")

        if SCALE_FACTOR != 1.0:
            new_size = (
                int(img.width * SCALE_FACTOR),
                int(img.height * SCALE_FACTOR)
            )
            img = img.resize(new_size, Image.LANCZOS)

        img.save(
            output_path,
            format="JPEG",
            quality=QUALITY,
        )
    return f"writing {output_path}"

def main():
    """Convert every TIFF found in INPUT_DIR and print one status line per file.

    Files are submitted to a process pool of MAX_WORKERS workers; status
    lines are printed in completion order. An exception raised by a worker
    is re-raised here by ``result()``.
    """
    tiff_files = [
        name
        for name in os.listdir(INPUT_DIR)
        if name.lower().endswith((".tif", ".tiff"))
    ]
    tiff_files.sort()
    if len(tiff_files) == 0:
        print("no TIFF files found.")
        return

    print(f"processing {len(tiff_files)} files using {MAX_WORKERS} workers...")

    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Map each pending future back to the file it was created for.
        pending = {}
        for name in tiff_files:
            pending[pool.submit(compress_tiff_to_jpeg, name)] = name
        for finished in as_completed(pending):
            print(finished.result())

if __name__ == "__main__":
    main()

The intended workflow

my workflow:

1. scan book pages to 600 dpi tiff images
crop, level, deskew the tiff images

2. run tesseract on the tiff images to produce hocr files

3. publish the hocr files in a git repo to track the progress of proofreading

4. run hocr-to-epub-fxl to convert hocr files to a 300 dpi fixed-layout epub file
hocr-to-epub-fxl --output out.epub --scale 0.5 --quality 30 ./ocr/*.hocr
this allows incremental updates along the proofreading process


now the missing part is where i convert hocr files to a pdf file

this works with my hocr2pdf.py but compression should be better
(133MB versus 554MB)

expected interface:
ocrmypdf --output out.pdf --scale 0.5 --quality 30 ./ocr/*.hocr

the input type should be detected from the .hocr file extension

image paths should be parsed from the hocr files
<div class='ocr_page' title='image "../070-deskew/001.tiff"'>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants