add conversion phase & progress metrics, timeout heuristics

This commit is contained in:
2026-03-26 23:30:25 -07:00
parent 26452aa57c
commit baf87ee195
12 changed files with 468 additions and 96 deletions
@@ -2,6 +2,7 @@
from __future__ import annotations
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
import subprocess
@@ -27,6 +28,14 @@ class SlideDeckResult:
slides: list[SlideArtifact]
ProgressCallback = Callable[[str, int, int], None]
PageProgressCallback = Callable[[int, int], None]
PHASE_EXTRACTING_NOTES = "extracting_notes"
PHASE_PPTX_TO_PDF = "pptx_to_pdf"
PHASE_PDF_TO_IMAGES = "pdf_to_images"
def convert_pptx_to_pdf(pptx_path: Path, pdf_path: Path, *, timeout_s: int = 120) -> Path:
"""Convert a PPTX file to PDF using headless LibreOffice.
@@ -92,6 +101,8 @@ def render_pdf_to_images(
dpi: int = 180,
image_format: str = "png",
timeout_s: int = 120,
total_pages: int | None = None,
page_progress_callback: PageProgressCallback | None = None,
) -> list[Path]:
"""Render each PDF page into an image using Poppler's `pdftoppm`.
@@ -113,35 +124,83 @@ def render_pdf_to_images(
raise FileNotFoundError(f"source PDF does not exist: {pdf_path}")
out_dir.mkdir(parents=True, exist_ok=True)
prefix_path = out_dir / "slide"
command = [
"pdftoppm",
"-r",
str(dpi),
f"-{image_format}",
str(pdf_path.resolve()),
str(prefix_path),
]
try:
completed = subprocess.run(
command,
check=False,
capture_output=True,
text=True,
timeout=timeout_s,
)
except subprocess.TimeoutExpired as exc:
raise RuntimeError(
"Poppler rasterization timed out after "
f"{timeout_s} seconds while rendering {pdf_path.name}; "
"increase conversion PDF render timeout or lower image DPI"
) from exc
if completed.returncode != 0:
raise RuntimeError(
f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
)
if total_pages is None:
prefix_path = out_dir / "slide"
command = [
"pdftoppm",
"-r",
str(dpi),
f"-{image_format}",
str(pdf_path.resolve()),
str(prefix_path),
]
try:
completed = subprocess.run(
command,
check=False,
capture_output=True,
text=True,
timeout=timeout_s,
)
except subprocess.TimeoutExpired as exc:
raise RuntimeError(
"Poppler rasterization timed out after "
f"{timeout_s} seconds while rendering {pdf_path.name}; "
"increase conversion PDF render timeout or lower image DPI"
) from exc
if completed.returncode != 0:
raise RuntimeError(
f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
)
images = sorted(out_dir.glob(f"slide-*.{image_format}"))
else:
if total_pages < 0:
raise ValueError("total_pages must be zero or greater")
images = []
for page_index in range(1, total_pages + 1):
page_prefix = out_dir / f"slide-{page_index:04d}"
command = [
"pdftoppm",
"-r",
str(dpi),
f"-{image_format}",
"-f",
str(page_index),
"-l",
str(page_index),
"-singlefile",
str(pdf_path.resolve()),
str(page_prefix),
]
try:
completed = subprocess.run(
command,
check=False,
capture_output=True,
text=True,
timeout=timeout_s,
)
except subprocess.TimeoutExpired as exc:
raise RuntimeError(
"Poppler rasterization timed out after "
f"{timeout_s} seconds while rendering page {page_index} "
f"of {pdf_path.name}; increase conversion PDF render timeout "
"or lower image DPI"
) from exc
if completed.returncode != 0:
raise RuntimeError(
"Poppler rasterization failed on page "
f"{page_index}: {completed.stderr.strip() or completed.stdout.strip()}"
)
image_path = page_prefix.with_suffix(f".{image_format}")
if not image_path.exists():
raise RuntimeError(
f"Poppler did not create expected page image: {image_path}"
)
images.append(image_path.resolve())
if page_progress_callback is not None:
page_progress_callback(page_index, total_pages)
images = sorted(out_dir.glob(f"slide-*.{image_format}"))
if not images:
raise RuntimeError(f"no rendered images found in {out_dir}")
return [image.resolve() for image in images]
@@ -180,6 +239,11 @@ def convert_pptx_to_slidedeck(
image_format: str = "png",
pptx_to_pdf_timeout_s: int = 180,
pdf_to_images_timeout_s: int = 600,
pptx_to_pdf_base_timeout_s: int = 45,
pptx_to_pdf_per_slide_timeout_s: int = 3,
pdf_to_images_base_timeout_s: int = 30,
pdf_to_images_per_slide_timeout_s: int = 8,
progress_callback: ProgressCallback | None = None,
) -> SlideDeckResult:
"""Convert a PPTX into rendered images and extracted notes.
@@ -206,15 +270,45 @@ def convert_pptx_to_slidedeck(
pdf_path = work_dir / f"{pptx_path.stem}.pdf"
image_dir = work_dir / "slides"
convert_pptx_to_pdf(pptx_path, pdf_path, timeout_s=pptx_to_pdf_timeout_s)
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 0, 1)
notes = extract_slide_notes(pptx_path)
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 1, 1)
slide_count = len(notes)
pptx_to_pdf_timeout = _compute_adaptive_timeout(
slide_count=slide_count,
timeout_cap_s=pptx_to_pdf_timeout_s,
base_timeout_s=pptx_to_pdf_base_timeout_s,
per_slide_timeout_s=pptx_to_pdf_per_slide_timeout_s,
)
pdf_to_images_timeout = _compute_adaptive_timeout(
slide_count=slide_count,
timeout_cap_s=pdf_to_images_timeout_s,
base_timeout_s=pdf_to_images_base_timeout_s,
per_slide_timeout_s=pdf_to_images_per_slide_timeout_s,
)
_emit_progress(progress_callback, PHASE_PPTX_TO_PDF, 0, 1)
convert_pptx_to_pdf(pptx_path, pdf_path, timeout_s=pptx_to_pdf_timeout)
_emit_progress(progress_callback, PHASE_PPTX_TO_PDF, 1, 1)
_emit_progress(progress_callback, PHASE_PDF_TO_IMAGES, 0, slide_count)
image_paths = render_pdf_to_images(
pdf_path,
image_dir,
dpi=dpi,
image_format=image_format,
timeout_s=pdf_to_images_timeout_s,
timeout_s=_compute_page_timeout(
total_timeout_s=pdf_to_images_timeout,
page_count=slide_count,
),
total_pages=slide_count,
page_progress_callback=lambda current, max_pages: _emit_progress(
progress_callback,
PHASE_PDF_TO_IMAGES,
current,
max_pages,
),
)
notes = extract_slide_notes(pptx_path)
if len(image_paths) != len(notes):
raise ValueError(
@@ -229,6 +323,40 @@ def convert_pptx_to_slidedeck(
return SlideDeckResult(source_filename=pptx_path.name, slides=slides)
def _compute_adaptive_timeout(
    *,
    slide_count: int,
    timeout_cap_s: int,
    base_timeout_s: int,
    per_slide_timeout_s: int,
) -> int:
    """Compute a bounded timeout that scales linearly with slide count.

    The budget is ``base_timeout_s + slides * per_slide_timeout_s``, limited
    to at most ``timeout_cap_s`` and never allowed to drop below 1.

    Args:
        slide_count: Number of slides in the deck; values below 1 are
            treated as 1.
        timeout_cap_s: Upper bound for the returned timeout.
        base_timeout_s: Fixed portion of the budget.
        per_slide_timeout_s: Additional budget granted per slide.

    Returns:
        The computed timeout, always at least 1.
    """
    # Budget at least one slide so an empty deck still gets per-slide time.
    normalized_slides = max(1, slide_count)
    adaptive_timeout = base_timeout_s + normalized_slides * per_slide_timeout_s
    # The final floor keeps a zero or negative cap from yielding a
    # non-positive timeout.
    return max(1, min(timeout_cap_s, adaptive_timeout))
def _compute_page_timeout(
    *,
    total_timeout_s: int,
    page_count: int,
    min_page_timeout_s: int = 15,
) -> int:
    """Split total PDF raster timeout into a bounded per-page timeout.

    The total budget is divided evenly across pages (rounding up) and then
    raised to ``min_page_timeout_s`` so that a single page is not cut off too
    aggressively on large decks. The result never exceeds the total budget
    itself, so one page cannot be granted more time than the whole phase.

    Args:
        total_timeout_s: Budget for rendering all pages; values below 1 are
            treated as 1.
        page_count: Number of pages to render. When zero or negative the
            whole budget is returned.
        min_page_timeout_s: Floor applied to the per-page share.

    Returns:
        The per-page timeout, always at least 1.
    """
    budget = max(1, total_timeout_s)
    if page_count <= 0:
        return budget
    # Ceiling division so the per-page shares add up to at least the budget.
    per_page = (budget + page_count - 1) // page_count
    # Apply the floor, but never hand one page more than the total budget.
    return min(budget, max(min_page_timeout_s, per_page))
def _emit_progress(
    progress_callback: ProgressCallback | None,
    phase: str,
    current_progress: int,
    max_progress: int,
) -> None:
    """Emit phase/progress updates when a callback is configured.

    Args:
        progress_callback: Receiver invoked as
            ``progress_callback(phase, current_progress, max_progress)``;
            ``None`` disables reporting.
        phase: Name of the conversion phase being reported.
        current_progress: Progress counter within the phase.
        max_progress: Value of the counter at which the phase is complete.
    """
    if progress_callback is None:
        return
    progress_callback(phase, current_progress, max_progress)
def _extract_notes_text(shapes: Iterable[object]) -> str:
"""Extract plain text from note shapes while preserving paragraph breaks."""
segments: list[str] = []
@@ -19,6 +19,10 @@ class ServerConfig:
conversion_image_dpi: int
conversion_pptx_to_pdf_timeout_seconds: int
conversion_pdf_to_images_timeout_seconds: int
conversion_pptx_to_pdf_base_timeout_seconds: int
conversion_pptx_to_pdf_per_slide_timeout_seconds: int
conversion_pdf_to_images_base_timeout_seconds: int
conversion_pdf_to_images_per_slide_timeout_seconds: int
conversion_cleanup_delay_seconds: int
@@ -31,13 +35,25 @@ def load_server_config() -> ServerConfig:
s3_secure=os.getenv("S3_USE_SSL", "false").lower() == "true",
s3_public_endpoint=os.getenv("S3_PUBLIC_ENDPOINT", "localhost:8333"),
s3_session_ttl_seconds=int(os.getenv("S3_SESSION_TTL_SECONDS", "3600")),
conversion_image_dpi=int(os.getenv("CONVERSION_IMAGE_DPI", "150")),
conversion_image_dpi=int(os.getenv("CONVERSION_IMAGE_DPI", "72")),
conversion_pptx_to_pdf_timeout_seconds=int(
os.getenv("CONVERSION_PPTX_TO_PDF_TIMEOUT_SECONDS", "180")
),
conversion_pdf_to_images_timeout_seconds=int(
os.getenv("CONVERSION_PDF_TO_IMAGES_TIMEOUT_SECONDS", "600")
),
conversion_pptx_to_pdf_base_timeout_seconds=int(
os.getenv("CONVERSION_PPTX_TO_PDF_BASE_TIMEOUT_SECONDS", "45")
),
conversion_pptx_to_pdf_per_slide_timeout_seconds=int(
os.getenv("CONVERSION_PPTX_TO_PDF_PER_SLIDE_TIMEOUT_SECONDS", "3")
),
conversion_pdf_to_images_base_timeout_seconds=int(
os.getenv("CONVERSION_PDF_TO_IMAGES_BASE_TIMEOUT_SECONDS", "30")
),
conversion_pdf_to_images_per_slide_timeout_seconds=int(
os.getenv("CONVERSION_PDF_TO_IMAGES_PER_SLIDE_TIMEOUT_SECONDS", "8")
),
conversion_cleanup_delay_seconds=int(
os.getenv("CONVERSION_CLEANUP_DELAY_SECONDS", "3600")
),
@@ -22,6 +22,9 @@ class ConversionSession:
bucket_name: str
upload_object_key: str
status: conversion_pb2.ConversionStatus
phase: conversion_pb2.ConversionPhase = conversion_pb2.CONVERSION_PHASE_INACTIVE
current_progress: int = 0
max_progress: int = 0
created_at: datetime = field(default_factory=utc_now)
updated_at: datetime = field(default_factory=utc_now)
error_message: str = ""
@@ -3,6 +3,7 @@
from __future__ import annotations
import asyncio
from collections.abc import Callable
from datetime import datetime, timedelta, timezone
from pathlib import Path
import shutil
@@ -14,6 +15,11 @@ from connectrpc.errors import ConnectError
from connectrpc.request import RequestContext
from google.protobuf.timestamp_pb2 import Timestamp
from officeconvert import SlideArtifact, convert_pptx_to_slidedeck
from officeconvert.conversion import (
PHASE_EXTRACTING_NOTES,
PHASE_PDF_TO_IMAGES,
PHASE_PPTX_TO_PDF,
)
from officeconvertapi.v1 import conversion_connect, conversion_pb2
from officeconvert_server.config import ServerConfig
@@ -98,6 +104,10 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
)
session.status = conversion_pb2.CONVERSION_STATUS_RUNNING
session.phase = conversion_pb2.CONVERSION_PHASE_INACTIVE
session.current_progress = 0
session.max_progress = 0
session.error_message = ""
session.updated_at = utc_now()
session.conversion_task = asyncio.create_task(self._run_conversion(session))
@@ -119,6 +129,9 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
status=session.status,
error_message=session.error_message,
updated_at=_to_timestamp(session.updated_at),
phase=session.phase,
current_progress=session.current_progress,
max_progress=session.max_progress,
)
async def get_slide_deck(
@@ -185,22 +198,47 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
dpi=self._config.conversion_image_dpi,
pptx_to_pdf_timeout_s=self._config.conversion_pptx_to_pdf_timeout_seconds,
pdf_to_images_timeout_s=self._config.conversion_pdf_to_images_timeout_seconds,
pptx_to_pdf_base_timeout_s=self._config.conversion_pptx_to_pdf_base_timeout_seconds,
pptx_to_pdf_per_slide_timeout_s=self._config.conversion_pptx_to_pdf_per_slide_timeout_seconds,
pdf_to_images_base_timeout_s=self._config.conversion_pdf_to_images_base_timeout_seconds,
pdf_to_images_per_slide_timeout_s=self._config.conversion_pdf_to_images_per_slide_timeout_seconds,
progress_callback=lambda phase_name, current, max_value: self._set_session_progress_from_name(
session,
phase_name=phase_name,
current_progress=current,
max_progress=max_value,
),
)
self._set_session_progress(
session,
phase=conversion_pb2.CONVERSION_PHASE_UPLOADING_RESULTS,
current_progress=0,
max_progress=len(result.slides),
)
session.slide_deck = await asyncio.to_thread(
self._upload_and_build_slide_deck,
session,
result.slides,
result.source_filename,
lambda current, max_value: self._set_session_progress(
session,
phase=conversion_pb2.CONVERSION_PHASE_UPLOADING_RESULTS,
current_progress=current,
max_progress=max_value,
),
)
session.status = conversion_pb2.CONVERSION_STATUS_SUCCEEDED
session.phase = conversion_pb2.CONVERSION_PHASE_INACTIVE
session.updated_at = utc_now()
except asyncio.CancelledError:
session.status = conversion_pb2.CONVERSION_STATUS_FAILED
session.phase = conversion_pb2.CONVERSION_PHASE_INACTIVE
session.error_message = "conversion cancelled"
session.updated_at = utc_now()
raise
except Exception as exc:
session.status = conversion_pb2.CONVERSION_STATUS_FAILED
session.phase = conversion_pb2.CONVERSION_PHASE_INACTIVE
session.error_message = str(exc)
session.updated_at = utc_now()
finally:
@@ -212,10 +250,12 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
session: ConversionSession,
slides: list[SlideArtifact],
source_filename: str,
progress_callback: Callable[[int, int], None] | None = None,
) -> conversion_pb2.SlideDeck:
"""Upload generated slide images and construct API response payload."""
response_slides: list[conversion_pb2.Slide] = []
for slide in slides:
slide_total = len(slides)
for slide_index, slide in enumerate(slides, start=1):
object_key = f"output/slide-{slide.index:04d}{slide.image_path.suffix}"
self._store.fput_object(session.bucket_name, object_key, slide.image_path)
image_url = self._store.presigned_get_url(
@@ -230,6 +270,8 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
image_url=image_url,
)
)
if progress_callback is not None:
progress_callback(slide_index, slide_total)
return conversion_pb2.SlideDeck(
conversion_id=session.conversion_id,
@@ -263,6 +305,45 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
raise ConnectError(Code.NOT_FOUND, "conversion_id not found")
return session
def _set_session_progress_from_name(
    self,
    session: ConversionSession,
    *,
    phase_name: str,
    current_progress: int,
    max_progress: int,
) -> None:
    """Map conversion-library phase names onto API enum phases.

    Translates the string phase reported by the conversion library into the
    matching ``conversion_pb2`` phase value and forwards it, together with
    the counters, to ``_set_session_progress``.

    Args:
        session: Session whose progress fields are updated.
        phase_name: One of the ``PHASE_*`` names from the conversion library.
        current_progress: Progress counter within the phase.
        max_progress: Value of the counter at which the phase is complete.
    """
    phase_map = {
        PHASE_EXTRACTING_NOTES: conversion_pb2.CONVERSION_PHASE_EXTRACTING_NOTES,
        PHASE_PPTX_TO_PDF: conversion_pb2.CONVERSION_PHASE_PPTX_TO_PDF,
        PHASE_PDF_TO_IMAGES: conversion_pb2.CONVERSION_PHASE_PDF_TO_IMAGES,
    }
    # Unrecognized names are reported as INACTIVE instead of raising.
    self._set_session_progress(
        session,
        phase=phase_map.get(phase_name, conversion_pb2.CONVERSION_PHASE_INACTIVE),
        current_progress=current_progress,
        max_progress=max_progress,
    )
def _set_session_progress(
    self,
    session: ConversionSession,
    *,
    phase: conversion_pb2.ConversionPhase,
    current_progress: int,
    max_progress: int,
) -> None:
    """Set normalized phase/progress counters and touch update timestamp.

    Negative counters are raised to 0. ``current_progress`` is capped at
    ``max_progress`` only when the maximum is positive; with a maximum of 0
    the current value is stored as given (after the non-negative clamp).

    Args:
        session: Session whose ``phase``, ``current_progress``,
            ``max_progress`` and ``updated_at`` fields are written.
        phase: API phase value to record.
        current_progress: Progress counter within the phase.
        max_progress: Value of the counter at which the phase is complete.
    """
    normalized_max = max(0, max_progress)
    normalized_current = max(0, current_progress)
    if normalized_max > 0:
        normalized_current = min(normalized_current, normalized_max)

    session.phase = phase
    session.current_progress = normalized_current
    session.max_progress = normalized_max
    session.updated_at = utc_now()
def _to_timestamp(value: datetime) -> Timestamp:
"""Convert a timezone-aware datetime to protobuf Timestamp."""