add detailed jpg quality opts & thumbnail pass
Docker server image / build-and-push (push) Successful in 3m48s

This commit is contained in:
2026-03-30 05:05:27 -07:00
parent 72d4d521e3
commit 30cbfaadad
10 changed files with 644 additions and 190 deletions
@@ -19,6 +19,7 @@ class SlideArtifact:
index: int
image_path: Path
thumbnail_path: Path
notes_plain: str
@@ -30,6 +31,8 @@ class SlideDeckResult:
slides: list[SlideArtifact]
width: int
height: int
thumbnail_width: int
thumbnail_height: int
inferred_dpi: int
pptx_to_pdf_timeout_s: int
pdf_to_images_timeout_s: int
@@ -133,6 +136,7 @@ def render_pdf_to_images(
target_width: int | None = None,
target_height: int | None = None,
image_format: str = "png",
jpeg_quality: int | None = None,
timeout_s: int = 120,
total_pages: int | None = None,
operation_timeout_s: int | None = None,
@@ -162,6 +166,15 @@ def render_pdf_to_images(
raise ValueError("target_width must be greater than zero")
if target_height is not None and target_height <= 0:
raise ValueError("target_height must be greater than zero")
normalized_image_format = _normalize_image_format(image_format)
output_suffix = _image_output_suffix(normalized_image_format)
normalized_jpeg_quality = _normalize_jpeg_quality(jpeg_quality)
jpeg_options_args: list[str] = []
if normalized_jpeg_quality is not None and normalized_image_format in {
"jpeg",
"jpegcmyk",
}:
jpeg_options_args = ["-jpegopt", f"quality={normalized_jpeg_quality}"]
out_dir.mkdir(parents=True, exist_ok=True)
scale_args: list[str] = []
@@ -174,7 +187,8 @@ def render_pdf_to_images(
"-r",
str(dpi),
*scale_args,
f"-{image_format}",
*jpeg_options_args,
f"-{normalized_image_format}",
str(pdf_path.resolve()),
str(prefix_path),
]
@@ -198,7 +212,7 @@ def render_pdf_to_images(
raise RuntimeError(
f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
)
images = sorted(out_dir.glob(f"slide-*.{image_format}"))
images = sorted(out_dir.glob(f"slide-*.{output_suffix}"))
else:
if total_pages < 0:
raise ValueError("total_pages must be zero or greater")
@@ -210,7 +224,8 @@ def render_pdf_to_images(
"-r",
str(dpi),
*scale_args,
f"-{image_format}",
*jpeg_options_args,
f"-{normalized_image_format}",
"-f",
str(page_index),
"-l",
@@ -246,7 +261,7 @@ def render_pdf_to_images(
"Poppler rasterization failed on page "
f"{page_index}: {completed.stderr.strip() or completed.stdout.strip()}"
)
image_path = page_prefix.with_suffix(f".{image_format}")
image_path = page_prefix.with_suffix(f".{output_suffix}")
if not image_path.exists():
raise RuntimeError(
f"Poppler did not create expected page image: {image_path}"
@@ -289,8 +304,10 @@ def convert_pptx_to_slidedeck(
pptx_path: Path,
work_dir: Path,
*,
resolution: str = RESOLUTION_FHD,
image_format: str = "png",
full_resolution: str = RESOLUTION_FHD,
thumbnail_resolution: str = RESOLUTION_SD,
full_jpeg_quality: int | None = None,
thumbnail_jpeg_quality: int | None = None,
pptx_to_pdf_timeout_s: int = 180,
pdf_to_images_timeout_s: int = 1800,
pptx_to_pdf_base_timeout_s: int = 45,
@@ -308,8 +325,10 @@ def convert_pptx_to_slidedeck(
Args:
pptx_path: Source `.pptx` path.
work_dir: Scratch directory for generated outputs.
resolution: Output resolution preset (`sd`, `hd`, `fhd`, `qhd`, `uhd`).
image_format: Output image format accepted by `pdftoppm`.
full_resolution: Full-size output resolution preset (`sd`, `hd`, `fhd`, `qhd`, `uhd`).
thumbnail_resolution: Thumbnail output resolution preset (`sd`, `hd`, `fhd`, `qhd`, `uhd`).
full_jpeg_quality: Full-size JPEG quality (`1..100`); `None`/`0` means default.
thumbnail_jpeg_quality: Thumbnail JPEG quality (`1..100`); `None`/`0` means default.
pptx_to_pdf_timeout_s: Timeout in seconds for the LibreOffice subprocess.
pdf_to_images_timeout_s: Timeout in seconds for the Poppler subprocess.
@@ -322,7 +341,8 @@ def convert_pptx_to_slidedeck(
work_dir = work_dir.resolve()
work_dir.mkdir(parents=True, exist_ok=True)
pdf_path = work_dir / f"{pptx_path.stem}.pdf"
image_dir = work_dir / "slides"
full_image_dir = work_dir / "slides_full"
thumbnail_image_dir = work_dir / "slides_thumb"
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 0, 1)
notes = extract_slide_notes(pptx_path)
@@ -331,7 +351,12 @@ def convert_pptx_to_slidedeck(
output_width, output_height = _infer_output_dimensions_from_slide_size(
slide_width=slide_width,
slide_height=slide_height,
resolution=resolution,
resolution=full_resolution,
)
thumbnail_width, thumbnail_height = _infer_output_dimensions_from_slide_size(
slide_width=slide_width,
slide_height=slide_height,
resolution=thumbnail_resolution,
)
inferred_dpi = infer_minimum_raster_dpi(
slide_width_emu=slide_width,
@@ -339,6 +364,12 @@ def convert_pptx_to_slidedeck(
output_width_px=output_width,
output_height_px=output_height,
)
thumbnail_inferred_dpi = infer_minimum_raster_dpi(
slide_width_emu=slide_width,
slide_height_emu=slide_height,
output_width_px=thumbnail_width,
output_height_px=thumbnail_height,
)
slide_count = len(notes)
pptx_to_pdf_timeout = _compute_adaptive_timeout(
slide_count=slide_count,
@@ -347,7 +378,7 @@ def convert_pptx_to_slidedeck(
per_slide_timeout_s=pptx_to_pdf_per_slide_timeout_s,
)
pdf_to_images_timeout = _compute_adaptive_timeout(
slide_count=slide_count,
slide_count=slide_count * 2,
timeout_cap_s=pdf_to_images_timeout_s,
base_timeout_s=pdf_to_images_base_timeout_s,
per_slide_timeout_s=pdf_to_images_per_slide_timeout_s,
@@ -357,34 +388,43 @@ def convert_pptx_to_slidedeck(
convert_pptx_to_pdf(pptx_path, pdf_path, timeout_s=pptx_to_pdf_timeout)
_emit_progress(progress_callback, PHASE_PPTX_TO_PDF, 1, 1)
_emit_progress(progress_callback, PHASE_PDF_TO_IMAGES, 0, slide_count)
raster_steps = slide_count * 2
_emit_progress(progress_callback, PHASE_PDF_TO_IMAGES, 0, raster_steps)
pdf_to_images_page_timeout = _compute_page_timeout(
total_timeout_s=pdf_to_images_timeout,
page_count=slide_count,
page_count=raster_steps,
base_timeout_s=pdf_to_images_base_timeout_s,
)
logger.info(
"Conversion plan source=%s slides=%d inferred_dpi=%d image_format=%s "
"resolution=%s output_size=%dx%d "
"Conversion plan source=%s slides=%d image_format=%s "
"full[resolution=%s size=%dx%d dpi=%d jpeg_quality=%s] "
"thumbnail[resolution=%s size=%dx%d dpi=%d jpeg_quality=%s] "
"computed_timeouts_s[pptx_to_pdf_total=%d,pdf_to_images_total=%d,pdf_to_images_per_page=%d]",
pptx_path.name,
slide_count,
inferred_dpi,
image_format,
resolution,
"jpeg",
full_resolution,
output_width,
output_height,
inferred_dpi,
str(full_jpeg_quality if full_jpeg_quality is not None else "default"),
thumbnail_resolution,
thumbnail_width,
thumbnail_height,
thumbnail_inferred_dpi,
str(thumbnail_jpeg_quality if thumbnail_jpeg_quality is not None else "default"),
pptx_to_pdf_timeout,
pdf_to_images_timeout,
pdf_to_images_page_timeout,
)
image_paths = render_pdf_to_images(
full_image_paths = render_pdf_to_images(
pdf_path,
image_dir,
full_image_dir,
dpi=inferred_dpi,
target_width=output_width,
target_height=output_height,
image_format=image_format,
image_format="jpeg",
jpeg_quality=full_jpeg_quality,
timeout_s=pdf_to_images_page_timeout,
total_pages=slide_count,
operation_timeout_s=pdf_to_images_timeout,
@@ -395,22 +435,55 @@ def convert_pptx_to_slidedeck(
max_pages,
),
)
thumbnail_image_paths = render_pdf_to_images(
pdf_path,
thumbnail_image_dir,
dpi=thumbnail_inferred_dpi,
target_width=thumbnail_width,
target_height=thumbnail_height,
image_format="jpeg",
jpeg_quality=thumbnail_jpeg_quality,
timeout_s=pdf_to_images_page_timeout,
total_pages=slide_count,
operation_timeout_s=pdf_to_images_timeout,
page_progress_callback=lambda current, max_pages: _emit_progress(
progress_callback,
PHASE_PDF_TO_IMAGES,
slide_count + current,
slide_count + max_pages,
),
)
if len(image_paths) != len(notes):
if len(full_image_paths) != len(notes):
raise ValueError(
"rendered slide count does not match note count: "
f"{len(image_paths)} image(s) vs {len(notes)} note entries"
"rendered full-size slide count does not match note count: "
f"{len(full_image_paths)} image(s) vs {len(notes)} note entries"
)
if len(thumbnail_image_paths) != len(notes):
raise ValueError(
"rendered thumbnail slide count does not match note count: "
f"{len(thumbnail_image_paths)} image(s) vs {len(notes)} note entries"
)
slides = [
SlideArtifact(index=index, image_path=image_path, notes_plain=note)
for index, (image_path, note) in enumerate(zip(image_paths, notes), start=1)
SlideArtifact(
index=index,
image_path=image_path,
thumbnail_path=thumbnail_path,
notes_plain=note,
)
for index, (image_path, thumbnail_path, note) in enumerate(
zip(full_image_paths, thumbnail_image_paths, notes),
start=1,
)
]
return SlideDeckResult(
source_filename=pptx_path.name,
slides=slides,
width=output_width,
height=output_height,
thumbnail_width=thumbnail_width,
thumbnail_height=thumbnail_height,
inferred_dpi=inferred_dpi,
pptx_to_pdf_timeout_s=pptx_to_pdf_timeout,
pdf_to_images_timeout_s=pdf_to_images_timeout,
@@ -511,6 +584,30 @@ def _compute_page_timeout(
return max(base_timeout_s, timeout)
def _normalize_image_format(image_format: str) -> str:
"""Normalize image format aliases accepted by Poppler."""
normalized = image_format.strip().lower()
if normalized == "jpg":
return "jpeg"
return normalized
def _image_output_suffix(image_format: str) -> str:
"""Return output filename suffix for a normalized Poppler image format."""
if image_format in {"jpeg", "jpegcmyk"}:
return "jpg"
return image_format
def _normalize_jpeg_quality(jpeg_quality: int | None) -> int | None:
"""Normalize optional JPEG quality; 0 means server/default quality."""
if jpeg_quality is None or jpeg_quality == 0:
return None
if 1 <= jpeg_quality <= 100:
return jpeg_quality
raise ValueError("jpeg_quality must be 0 or between 1 and 100")
def _emit_progress(
progress_callback: ProgressCallback | None,
phase: str,