allow specifying conversion resolution, drop explicit dpi
This commit is contained in:
@@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
import logging
|
||||
import math
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
from typing import Iterable
|
||||
@@ -27,6 +28,12 @@ class SlideDeckResult:
|
||||
|
||||
source_filename: str
|
||||
slides: list[SlideArtifact]
|
||||
width: int
|
||||
height: int
|
||||
inferred_dpi: int
|
||||
pptx_to_pdf_timeout_s: int
|
||||
pdf_to_images_timeout_s: int
|
||||
pdf_to_images_page_timeout_s: int
|
||||
|
||||
|
||||
ProgressCallback = Callable[[str, int, int], None]
|
||||
@@ -36,6 +43,21 @@ PHASE_EXTRACTING_NOTES = "extracting_notes"
|
||||
PHASE_PPTX_TO_PDF = "pptx_to_pdf"
|
||||
PHASE_PDF_TO_IMAGES = "pdf_to_images"
|
||||
|
||||
# Names of the supported output-resolution presets. These are the strings a
# caller passes as `resolution`; lookup strips whitespace and lowercases first.
RESOLUTION_SD = "sd"
|
||||
RESOLUTION_HD = "hd"
|
||||
RESOLUTION_FHD = "fhd"
|
||||
RESOLUTION_QHD = "qhd"
|
||||
RESOLUTION_UHD = "uhd"
|
||||
|
||||
# Pixel length of the SHORTER output edge for each preset. The longer edge is
# derived from the slide's aspect ratio, so landscape decks get this value as
# their height and portrait decks get it as their width.
_SHORT_EDGE_PIXELS_BY_RESOLUTION = {
|
||||
RESOLUTION_SD: 480,
|
||||
RESOLUTION_HD: 720,
|
||||
RESOLUTION_FHD: 1080,
|
||||
RESOLUTION_QHD: 1440,
|
||||
RESOLUTION_UHD: 2160,
|
||||
}
|
||||
# English Metric Units per inch; used to turn EMU slide dimensions into inches
# when computing the raster DPI required for a target pixel size.
_EMU_PER_INCH = 914400
|
||||
|
||||
logger = logging.getLogger("uvicorn.error")
|
||||
|
||||
|
||||
@@ -108,6 +130,8 @@ def render_pdf_to_images(
|
||||
out_dir: Path,
|
||||
*,
|
||||
dpi: int = 72,
|
||||
target_width: int | None = None,
|
||||
target_height: int | None = None,
|
||||
image_format: str = "png",
|
||||
timeout_s: int = 120,
|
||||
total_pages: int | None = None,
|
||||
@@ -132,14 +156,24 @@ def render_pdf_to_images(
|
||||
"""
|
||||
if not pdf_path.exists():
|
||||
raise FileNotFoundError(f"source PDF does not exist: {pdf_path}")
|
||||
if (target_width is None) != (target_height is None):
|
||||
raise ValueError("target_width and target_height must be provided together")
|
||||
if target_width is not None and target_width <= 0:
|
||||
raise ValueError("target_width must be greater than zero")
|
||||
if target_height is not None and target_height <= 0:
|
||||
raise ValueError("target_height must be greater than zero")
|
||||
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
scale_args: list[str] = []
|
||||
if target_width is not None and target_height is not None:
|
||||
scale_args = ["-scale-to-x", str(target_width), "-scale-to-y", str(target_height)]
|
||||
if total_pages is None:
|
||||
prefix_path = out_dir / "slide"
|
||||
command = [
|
||||
"pdftoppm",
|
||||
"-r",
|
||||
str(dpi),
|
||||
*scale_args,
|
||||
f"-{image_format}",
|
||||
str(pdf_path.resolve()),
|
||||
str(prefix_path),
|
||||
@@ -156,7 +190,7 @@ def render_pdf_to_images(
|
||||
message = (
|
||||
"Poppler rasterization timed out after "
|
||||
f"{timeout_s} seconds while rendering {pdf_path.name}; "
|
||||
"increase conversion PDF render timeout cap or lower image DPI"
|
||||
"increase conversion PDF render timeout cap or lower output resolution"
|
||||
)
|
||||
logger.error(message, exc_info=True)
|
||||
raise ConversionTimeoutError(message) from exc
|
||||
@@ -175,6 +209,7 @@ def render_pdf_to_images(
|
||||
"pdftoppm",
|
||||
"-r",
|
||||
str(dpi),
|
||||
*scale_args,
|
||||
f"-{image_format}",
|
||||
"-f",
|
||||
str(page_index),
|
||||
@@ -202,7 +237,7 @@ def render_pdf_to_images(
|
||||
message = (
|
||||
"Poppler rasterization timed out while rendering page "
|
||||
f"{page_index}/{total_pages} of {pdf_path.name}; "
|
||||
f"{timeout_context}. Increase timeout settings or lower image DPI."
|
||||
f"{timeout_context}. Increase timeout settings or lower output resolution."
|
||||
)
|
||||
logger.error(message, exc_info=True)
|
||||
raise ConversionTimeoutError(message) from exc
|
||||
@@ -254,7 +289,7 @@ def convert_pptx_to_slidedeck(
|
||||
pptx_path: Path,
|
||||
work_dir: Path,
|
||||
*,
|
||||
dpi: int = 72,
|
||||
resolution: str = RESOLUTION_FHD,
|
||||
image_format: str = "png",
|
||||
pptx_to_pdf_timeout_s: int = 180,
|
||||
pdf_to_images_timeout_s: int = 1800,
|
||||
@@ -273,7 +308,7 @@ def convert_pptx_to_slidedeck(
|
||||
Args:
|
||||
pptx_path: Source `.pptx` path.
|
||||
work_dir: Scratch directory for generated outputs.
|
||||
dpi: Rasterization DPI for output slide images.
|
||||
resolution: Output resolution preset (`sd`, `hd`, `fhd`, `qhd`, `uhd`).
|
||||
image_format: Output image format accepted by `pdftoppm`.
|
||||
pptx_to_pdf_timeout_s: Timeout in seconds for the LibreOffice subprocess.
|
||||
pdf_to_images_timeout_s: Timeout in seconds for the Poppler subprocess.
|
||||
@@ -292,6 +327,18 @@ def convert_pptx_to_slidedeck(
|
||||
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 0, 1)
|
||||
notes = extract_slide_notes(pptx_path)
|
||||
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 1, 1)
|
||||
slide_width, slide_height = _read_slide_size_emu(pptx_path)
|
||||
output_width, output_height = _infer_output_dimensions_from_slide_size(
|
||||
slide_width=slide_width,
|
||||
slide_height=slide_height,
|
||||
resolution=resolution,
|
||||
)
|
||||
inferred_dpi = infer_minimum_raster_dpi(
|
||||
slide_width_emu=slide_width,
|
||||
slide_height_emu=slide_height,
|
||||
output_width_px=output_width,
|
||||
output_height_px=output_height,
|
||||
)
|
||||
slide_count = len(notes)
|
||||
pptx_to_pdf_timeout = _compute_adaptive_timeout(
|
||||
slide_count=slide_count,
|
||||
@@ -317,12 +364,16 @@ def convert_pptx_to_slidedeck(
|
||||
base_timeout_s=pdf_to_images_base_timeout_s,
|
||||
)
|
||||
logger.info(
|
||||
"Conversion plan source=%s slides=%d dpi=%d image_format=%s "
|
||||
"Conversion plan source=%s slides=%d inferred_dpi=%d image_format=%s "
|
||||
"resolution=%s output_size=%dx%d "
|
||||
"computed_timeouts_s[pptx_to_pdf_total=%d,pdf_to_images_total=%d,pdf_to_images_per_page=%d]",
|
||||
pptx_path.name,
|
||||
slide_count,
|
||||
dpi,
|
||||
inferred_dpi,
|
||||
image_format,
|
||||
resolution,
|
||||
output_width,
|
||||
output_height,
|
||||
pptx_to_pdf_timeout,
|
||||
pdf_to_images_timeout,
|
||||
pdf_to_images_page_timeout,
|
||||
@@ -330,7 +381,9 @@ def convert_pptx_to_slidedeck(
|
||||
image_paths = render_pdf_to_images(
|
||||
pdf_path,
|
||||
image_dir,
|
||||
dpi=dpi,
|
||||
dpi=inferred_dpi,
|
||||
target_width=output_width,
|
||||
target_height=output_height,
|
||||
image_format=image_format,
|
||||
timeout_s=pdf_to_images_page_timeout,
|
||||
total_pages=slide_count,
|
||||
@@ -353,7 +406,82 @@ def convert_pptx_to_slidedeck(
|
||||
SlideArtifact(index=index, image_path=image_path, notes_plain=note)
|
||||
for index, (image_path, note) in enumerate(zip(image_paths, notes), start=1)
|
||||
]
|
||||
return SlideDeckResult(source_filename=pptx_path.name, slides=slides)
|
||||
return SlideDeckResult(
|
||||
source_filename=pptx_path.name,
|
||||
slides=slides,
|
||||
width=output_width,
|
||||
height=output_height,
|
||||
inferred_dpi=inferred_dpi,
|
||||
pptx_to_pdf_timeout_s=pptx_to_pdf_timeout,
|
||||
pdf_to_images_timeout_s=pdf_to_images_timeout,
|
||||
pdf_to_images_page_timeout_s=pdf_to_images_page_timeout,
|
||||
)
|
||||
|
||||
|
||||
def infer_output_dimensions_for_resolution(
    pptx_path: Path,
    *,
    resolution: str,
) -> tuple[int, int]:
    """Return the output image size in pixels for a deck at a given preset.

    The slide size is read from the presentation file and combined with the
    preset's short-edge pixel count, so the result keeps the source aspect
    ratio.

    Args:
        pptx_path: Source `.pptx` path whose slide size is read.
        resolution: Output resolution preset name.

    Returns:
        `(width, height)` of the output images in pixels.

    Raises:
        FileNotFoundError: If `pptx_path` does not exist.
        ValueError: If the slide size is missing or invalid, or the preset is
            not supported.
    """
    width_emu, height_emu = _read_slide_size_emu(pptx_path)
    dimensions = _infer_output_dimensions_from_slide_size(
        slide_width=width_emu,
        slide_height=height_emu,
        resolution=resolution,
    )
    return dimensions
|
||||
|
||||
|
||||
def infer_minimum_raster_dpi(
    *,
    slide_width_emu: int,
    slide_height_emu: int,
    output_width_px: int,
    output_height_px: int,
) -> int:
    """Return the smallest whole DPI that reaches the target pixel size.

    The DPI needed along each axis is `pixels / inches`, with the slide
    extent converted from EMU to inches. The larger of the two axis values is
    rounded up so neither axis falls short of its target.

    Args:
        slide_width_emu: Slide width in EMU.
        slide_height_emu: Slide height in EMU.
        output_width_px: Desired output width in pixels.
        output_height_px: Desired output height in pixels.

    Returns:
        The required DPI as an integer, never less than 1.

    Raises:
        ValueError: If any slide or output dimension is zero or negative.
    """
    if min(slide_width_emu, slide_height_emu) <= 0:
        raise ValueError("source slide dimensions must be greater than zero")
    if min(output_width_px, output_height_px) <= 0:
        raise ValueError("output dimensions must be greater than zero")

    # Same arithmetic per axis: (pixels * EMU-per-inch) / slide extent in EMU.
    axis_pairs = (
        (output_width_px, slide_width_emu),
        (output_height_px, slide_height_emu),
    )
    required_dpi = math.ceil(
        max((pixels * _EMU_PER_INCH) / extent_emu for pixels, extent_emu in axis_pairs)
    )
    return required_dpi if required_dpi > 1 else 1
|
||||
|
||||
|
||||
def _read_slide_size_emu(pptx_path: Path) -> tuple[int, int]:
    """Return the presentation's slide size as `(width, height)` in EMU.

    Args:
        pptx_path: Source `.pptx` path.

    Returns:
        Slide width and height in English Metric Units, both positive ints.

    Raises:
        FileNotFoundError: If `pptx_path` does not exist.
        ValueError: If the presentation defines no slide size, or either
            dimension is zero or negative.
    """
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    deck = Presentation(str(pptx_path.resolve()))
    # `slide_width` / `slide_height` are the python-pptx accessors for slide size.
    raw_width, raw_height = deck.slide_width, deck.slide_height
    if raw_width is None or raw_height is None:
        raise ValueError("source presentation did not define slide dimensions")

    # Coerce to plain ints before the sign check and before returning.
    width_emu, height_emu = int(raw_width), int(raw_height)
    if width_emu <= 0 or height_emu <= 0:
        raise ValueError("source slide dimensions must be greater than zero")
    return width_emu, height_emu
|
||||
|
||||
|
||||
def _infer_output_dimensions_from_slide_size(
    *,
    slide_width: int,
    slide_height: int,
    resolution: str,
) -> tuple[int, int]:
    """Infer output dimensions from slide size and short-edge preset.

    The preset fixes the pixel length of the shorter output edge; the longer
    edge is scaled by the slide's aspect ratio and rounded to a whole pixel.

    Args:
        slide_width: Slide width; only its ratio to `slide_height` matters.
        slide_height: Slide height, in the same unit as `slide_width`.
        resolution: Preset name; surrounding whitespace and letter case are
            ignored.

    Returns:
        `(width, height)` in pixels. For landscape or square slides the
        height is the preset's short edge; for portrait slides the width is.

    Raises:
        ValueError: If either slide dimension is zero or negative, or if
            `resolution` is not a known preset.
    """
    # Guard the divisions below: a zero dimension would otherwise surface as a
    # bare ZeroDivisionError, and negative values would silently yield a
    # "long" edge shorter than the short edge.
    if slide_width <= 0 or slide_height <= 0:
        raise ValueError("source slide dimensions must be greater than zero")

    normalized = resolution.strip().lower()
    short_edge_pixels = _SHORT_EDGE_PIXELS_BY_RESOLUTION.get(normalized)
    if short_edge_pixels is None:
        raise ValueError(f"unsupported resolution preset: {resolution}")

    if slide_width >= slide_height:
        # Landscape or square: the height is the short edge.
        long_edge = max(1, round(short_edge_pixels * (slide_width / slide_height)))
        return long_edge, short_edge_pixels

    # Portrait: the width is the short edge.
    long_edge = max(1, round(short_edge_pixels * (slide_height / slide_width)))
    return short_edge_pixels, long_edge
|
||||
|
||||
|
||||
def _compute_adaptive_timeout(
|
||||
|
||||
Reference in New Issue
Block a user