add conversion phase & progress metrics, timeout heuristics
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
@@ -27,6 +28,14 @@ class SlideDeckResult:
|
||||
slides: list[SlideArtifact]
|
||||
|
||||
|
||||
# Phase-level progress hook. `_emit_progress` invokes it positionally as
# (phase, current_progress, max_progress).
ProgressCallback = Callable[[str, int, int], None]
|
||||
# Per-page progress hook. `render_pdf_to_images` invokes it as
# (page_index, total_pages) after each page image has been produced.
PageProgressCallback = Callable[[int, int], None]
|
||||
|
||||
# Phase identifiers passed as the `phase` argument of ProgressCallback by
# `convert_pptx_to_slidedeck`.
PHASE_EXTRACTING_NOTES = "extracting_notes"
|
||||
PHASE_PPTX_TO_PDF = "pptx_to_pdf"
|
||||
PHASE_PDF_TO_IMAGES = "pdf_to_images"
|
||||
|
||||
|
||||
def convert_pptx_to_pdf(pptx_path: Path, pdf_path: Path, *, timeout_s: int = 120) -> Path:
|
||||
"""Convert a PPTX file to PDF using headless LibreOffice.
|
||||
|
||||
@@ -92,6 +101,8 @@ def render_pdf_to_images(
|
||||
dpi: int = 180,
|
||||
image_format: str = "png",
|
||||
timeout_s: int = 120,
|
||||
total_pages: int | None = None,
|
||||
page_progress_callback: PageProgressCallback | None = None,
|
||||
) -> list[Path]:
|
||||
"""Render each PDF page into an image using Poppler's `pdftoppm`.
|
||||
|
||||
@@ -113,35 +124,83 @@ def render_pdf_to_images(
|
||||
raise FileNotFoundError(f"source PDF does not exist: {pdf_path}")
|
||||
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
prefix_path = out_dir / "slide"
|
||||
command = [
|
||||
"pdftoppm",
|
||||
"-r",
|
||||
str(dpi),
|
||||
f"-{image_format}",
|
||||
str(pdf_path.resolve()),
|
||||
str(prefix_path),
|
||||
]
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
command,
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout_s,
|
||||
)
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
raise RuntimeError(
|
||||
"Poppler rasterization timed out after "
|
||||
f"{timeout_s} seconds while rendering {pdf_path.name}; "
|
||||
"increase conversion PDF render timeout or lower image DPI"
|
||||
) from exc
|
||||
if completed.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
|
||||
)
|
||||
if total_pages is None:
|
||||
prefix_path = out_dir / "slide"
|
||||
command = [
|
||||
"pdftoppm",
|
||||
"-r",
|
||||
str(dpi),
|
||||
f"-{image_format}",
|
||||
str(pdf_path.resolve()),
|
||||
str(prefix_path),
|
||||
]
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
command,
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout_s,
|
||||
)
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
raise RuntimeError(
|
||||
"Poppler rasterization timed out after "
|
||||
f"{timeout_s} seconds while rendering {pdf_path.name}; "
|
||||
"increase conversion PDF render timeout or lower image DPI"
|
||||
) from exc
|
||||
if completed.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
|
||||
)
|
||||
images = sorted(out_dir.glob(f"slide-*.{image_format}"))
|
||||
else:
|
||||
if total_pages < 0:
|
||||
raise ValueError("total_pages must be zero or greater")
|
||||
images = []
|
||||
for page_index in range(1, total_pages + 1):
|
||||
page_prefix = out_dir / f"slide-{page_index:04d}"
|
||||
command = [
|
||||
"pdftoppm",
|
||||
"-r",
|
||||
str(dpi),
|
||||
f"-{image_format}",
|
||||
"-f",
|
||||
str(page_index),
|
||||
"-l",
|
||||
str(page_index),
|
||||
"-singlefile",
|
||||
str(pdf_path.resolve()),
|
||||
str(page_prefix),
|
||||
]
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
command,
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout_s,
|
||||
)
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
raise RuntimeError(
|
||||
"Poppler rasterization timed out after "
|
||||
f"{timeout_s} seconds while rendering page {page_index} "
|
||||
f"of {pdf_path.name}; increase conversion PDF render timeout "
|
||||
"or lower image DPI"
|
||||
) from exc
|
||||
if completed.returncode != 0:
|
||||
raise RuntimeError(
|
||||
"Poppler rasterization failed on page "
|
||||
f"{page_index}: {completed.stderr.strip() or completed.stdout.strip()}"
|
||||
)
|
||||
image_path = page_prefix.with_suffix(f".{image_format}")
|
||||
if not image_path.exists():
|
||||
raise RuntimeError(
|
||||
f"Poppler did not create expected page image: {image_path}"
|
||||
)
|
||||
images.append(image_path.resolve())
|
||||
if page_progress_callback is not None:
|
||||
page_progress_callback(page_index, total_pages)
|
||||
|
||||
images = sorted(out_dir.glob(f"slide-*.{image_format}"))
|
||||
if not images:
|
||||
raise RuntimeError(f"no rendered images found in {out_dir}")
|
||||
return [image.resolve() for image in images]
|
||||
@@ -180,6 +239,11 @@ def convert_pptx_to_slidedeck(
|
||||
image_format: str = "png",
|
||||
pptx_to_pdf_timeout_s: int = 180,
|
||||
pdf_to_images_timeout_s: int = 600,
|
||||
pptx_to_pdf_base_timeout_s: int = 45,
|
||||
pptx_to_pdf_per_slide_timeout_s: int = 3,
|
||||
pdf_to_images_base_timeout_s: int = 30,
|
||||
pdf_to_images_per_slide_timeout_s: int = 8,
|
||||
progress_callback: ProgressCallback | None = None,
|
||||
) -> SlideDeckResult:
|
||||
"""Convert a PPTX into rendered images and extracted notes.
|
||||
|
||||
@@ -206,15 +270,45 @@ def convert_pptx_to_slidedeck(
|
||||
pdf_path = work_dir / f"{pptx_path.stem}.pdf"
|
||||
image_dir = work_dir / "slides"
|
||||
|
||||
convert_pptx_to_pdf(pptx_path, pdf_path, timeout_s=pptx_to_pdf_timeout_s)
|
||||
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 0, 1)
|
||||
notes = extract_slide_notes(pptx_path)
|
||||
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 1, 1)
|
||||
slide_count = len(notes)
|
||||
pptx_to_pdf_timeout = _compute_adaptive_timeout(
|
||||
slide_count=slide_count,
|
||||
timeout_cap_s=pptx_to_pdf_timeout_s,
|
||||
base_timeout_s=pptx_to_pdf_base_timeout_s,
|
||||
per_slide_timeout_s=pptx_to_pdf_per_slide_timeout_s,
|
||||
)
|
||||
pdf_to_images_timeout = _compute_adaptive_timeout(
|
||||
slide_count=slide_count,
|
||||
timeout_cap_s=pdf_to_images_timeout_s,
|
||||
base_timeout_s=pdf_to_images_base_timeout_s,
|
||||
per_slide_timeout_s=pdf_to_images_per_slide_timeout_s,
|
||||
)
|
||||
|
||||
_emit_progress(progress_callback, PHASE_PPTX_TO_PDF, 0, 1)
|
||||
convert_pptx_to_pdf(pptx_path, pdf_path, timeout_s=pptx_to_pdf_timeout)
|
||||
_emit_progress(progress_callback, PHASE_PPTX_TO_PDF, 1, 1)
|
||||
|
||||
_emit_progress(progress_callback, PHASE_PDF_TO_IMAGES, 0, slide_count)
|
||||
image_paths = render_pdf_to_images(
|
||||
pdf_path,
|
||||
image_dir,
|
||||
dpi=dpi,
|
||||
image_format=image_format,
|
||||
timeout_s=pdf_to_images_timeout_s,
|
||||
timeout_s=_compute_page_timeout(
|
||||
total_timeout_s=pdf_to_images_timeout,
|
||||
page_count=slide_count,
|
||||
),
|
||||
total_pages=slide_count,
|
||||
page_progress_callback=lambda current, max_pages: _emit_progress(
|
||||
progress_callback,
|
||||
PHASE_PDF_TO_IMAGES,
|
||||
current,
|
||||
max_pages,
|
||||
),
|
||||
)
|
||||
notes = extract_slide_notes(pptx_path)
|
||||
|
||||
if len(image_paths) != len(notes):
|
||||
raise ValueError(
|
||||
@@ -229,6 +323,40 @@ def convert_pptx_to_slidedeck(
|
||||
return SlideDeckResult(source_filename=pptx_path.name, slides=slides)
|
||||
|
||||
|
||||
def _compute_adaptive_timeout(
|
||||
*,
|
||||
slide_count: int,
|
||||
timeout_cap_s: int,
|
||||
base_timeout_s: int,
|
||||
per_slide_timeout_s: int,
|
||||
) -> int:
|
||||
"""Compute a bounded timeout that scales linearly with slide count."""
|
||||
normalized_slides = max(1, slide_count)
|
||||
adaptive_timeout = base_timeout_s + (normalized_slides * per_slide_timeout_s)
|
||||
bounded_timeout = min(timeout_cap_s, adaptive_timeout)
|
||||
return max(1, bounded_timeout)
|
||||
|
||||
|
||||
def _compute_page_timeout(*, total_timeout_s: int, page_count: int) -> int:
|
||||
"""Split total PDF raster timeout into a bounded per-page timeout."""
|
||||
if page_count <= 0:
|
||||
return max(1, total_timeout_s)
|
||||
timeout = (total_timeout_s + page_count - 1) // page_count
|
||||
return max(15, timeout)
|
||||
|
||||
|
||||
def _emit_progress(
|
||||
progress_callback: ProgressCallback | None,
|
||||
phase: str,
|
||||
current_progress: int,
|
||||
max_progress: int,
|
||||
) -> None:
|
||||
"""Emit phase/progress updates when a callback is configured."""
|
||||
if progress_callback is None:
|
||||
return
|
||||
progress_callback(phase, current_progress, max_progress)
|
||||
|
||||
|
||||
def _extract_notes_text(shapes: Iterable[object]) -> str:
|
||||
"""Extract plain text from note shapes while preserving paragraph breaks."""
|
||||
segments: list[str] = []
|
||||
|
||||
Reference in New Issue
Block a user