mvp implementation

2026-03-26 14:01:10 -07:00
parent 0cde587220
commit ebcf404fde
33 changed files with 3048 additions and 6 deletions
@@ -0,0 +1,16 @@
+[project]
+name = "officeconvert"
+version = "0.1.0"
+description = "Core conversion primitives for PPTX to SlideDeck artifacts."
+readme = "../../../README.md"
+requires-python = ">=3.12"
+# Python-level dependencies only. The conversion code also shells out to the
+# `soffice` (LibreOffice) and `pdftoppm` (Poppler) executables, which have to
+# be installed separately and be discoverable on PATH.
+dependencies = [
+  "python-pptx>=1.0.2",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+# Ship only the package under src/ in the built wheel.
+[tool.hatch.build.targets.wheel]
+packages = ["src/officeconvert"]
@@ -0,0 +1,19 @@
+"""Public conversion APIs for the officeconvert Python library."""
+
+from officeconvert.conversion import (
+    SlideArtifact,
+    SlideDeckResult,
+    convert_pptx_to_pdf,
+    convert_pptx_to_slidedeck,
+    extract_slide_notes,
+    render_pdf_to_images,
+)
+
+# Names re-exported from `officeconvert.conversion`; this list defines what
+# `from officeconvert import *` exposes.
+__all__ = [
+    "SlideArtifact",
+    "SlideDeckResult",
+    "convert_pptx_to_pdf",
+    "convert_pptx_to_slidedeck",
+    "extract_slide_notes",
+    "render_pdf_to_images",
+]
@@ -0,0 +1,225 @@
+"""Conversion utilities for transforming PPTX files into slide image artifacts."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+import subprocess
+import tempfile
+from typing import Iterable
+
+from pptx import Presentation
+
+
+@dataclass(frozen=True, slots=True)
+class SlideArtifact:
+    """Represents one converted slide image and its extracted notes.
+
+    Attributes:
+        index: Position of the slide within its deck. `convert_pptx_to_slidedeck`
+            numbers slides starting at 1.
+        image_path: Location of the rendered image file for this slide.
+        notes_plain: Plain-text notes for the slide; an empty string when the
+            slide has no notes.
+    """
+
+    index: int
+    image_path: Path
+    notes_plain: str
+
+
+@dataclass(frozen=True, slots=True)
+class SlideDeckResult:
+    """Represents all conversion artifacts for a single source presentation.
+
+    Attributes:
+        source_filename: Name of the source presentation. `convert_pptx_to_slidedeck`
+            stores the bare file name here, not the full path.
+        slides: One `SlideArtifact` per slide, in deck order. `frozen=True` only
+            prevents rebinding this attribute; the list itself stays mutable.
+    """
+
+    source_filename: str
+    slides: list[SlideArtifact]
+
+
+def convert_pptx_to_pdf(pptx_path: Path, pdf_path: Path, *, timeout_s: int = 120) -> Path:
+    """Convert a PPTX file to PDF using headless LibreOffice.
+
+    Args:
+        pptx_path: Source `.pptx` path.
+        pdf_path: Destination `.pdf` path.
+        timeout_s: Maximum process runtime in seconds.
+
+    Returns:
+        The resolved PDF path.
+
+    Raises:
+        FileNotFoundError: If the source PPTX does not exist.
+        RuntimeError: If LibreOffice fails or does not create expected output.
+    """
+    if not pptx_path.exists():
+        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")
+
+    output_dir = pdf_path.parent.resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    command = [
+        "soffice",
+        "--headless",
+        "--convert-to",
+        "pdf",
+        "--outdir",
+        str(output_dir),
+        str(pptx_path.resolve()),
+    ]
+    completed = subprocess.run(
+        command,
+        check=False,
+        capture_output=True,
+        text=True,
+        timeout=timeout_s,
+    )
+    if completed.returncode != 0:
+        raise RuntimeError(
+            f"LibreOffice conversion failed: {completed.stderr.strip() or completed.stdout.strip()}"
+        )
+
+    generated_pdf = output_dir / f"{pptx_path.stem}.pdf"
+    if not generated_pdf.exists():
+        raise RuntimeError(f"LibreOffice did not create expected PDF: {generated_pdf}")
+
+    if generated_pdf != pdf_path:
+        generated_pdf.replace(pdf_path)
+
+    return pdf_path.resolve()
+
+
+def render_pdf_to_images(
+    pdf_path: Path,
+    out_dir: Path,
+    *,
+    dpi: int = 180,
+    image_format: str = "png",
+    timeout_s: int = 120,
+) -> list[Path]:
+    """Render each PDF page into an image using Poppler's `pdftoppm`.
+
+    Args:
+        pdf_path: Source PDF path.
+        out_dir: Output directory for rendered images.
+        dpi: Target rasterization DPI.
+        image_format: Image format supported by `pdftoppm` (`png`, `jpeg`, ...).
+        timeout_s: Maximum command runtime in seconds.
+
+    Returns:
+        Ordered list of slide image paths.
+
+    Raises:
+        FileNotFoundError: If the PDF path does not exist.
+        RuntimeError: If rasterization fails or no output images are produced.
+    """
+    if not pdf_path.exists():
+        raise FileNotFoundError(f"source PDF does not exist: {pdf_path}")
+
+    out_dir.mkdir(parents=True, exist_ok=True)
+    prefix_path = out_dir / "slide"
+    command = [
+        "pdftoppm",
+        "-r",
+        str(dpi),
+        f"-{image_format}",
+        str(pdf_path.resolve()),
+        str(prefix_path),
+    ]
+    completed = subprocess.run(
+        command,
+        check=False,
+        capture_output=True,
+        text=True,
+        timeout=timeout_s,
+    )
+    if completed.returncode != 0:
+        raise RuntimeError(
+            f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
+        )
+
+    images = sorted(out_dir.glob(f"slide-*.{image_format}"))
+    if not images:
+        raise RuntimeError(f"no rendered images found in {out_dir}")
+    return [image.resolve() for image in images]
+
+
+def extract_slide_notes(pptx_path: Path) -> list[str]:
+    """Extract plain-text notes for each slide in slide index order.
+
+    Args:
+        pptx_path: Source presentation path.
+
+    Returns:
+        A list of note strings aligned with source slide order.
+
+    Raises:
+        FileNotFoundError: If the source PPTX does not exist.
+    """
+    if not pptx_path.exists():
+        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")
+
+    presentation = Presentation(str(pptx_path.resolve()))
+    notes: list[str] = []
+    for slide in presentation.slides:
+        if not slide.has_notes_slide:
+            notes.append("")
+            continue
+        notes.append(_extract_notes_text(slide.notes_slide.shapes))
+    return notes
+
+
+def convert_pptx_to_slidedeck(
+    pptx_path: Path,
+    work_dir: Path,
+    *,
+    dpi: int = 180,
+    image_format: str = "png",
+) -> SlideDeckResult:
+    """Convert a PPTX into rendered images and extracted notes.
+
+    The pipeline performs PPTX->PDF conversion with LibreOffice and then PDF->images
+    rendering with Poppler. Notes are extracted from the original PPTX so text
+    fidelity is preserved independent of rendering output.
+
+    Args:
+        pptx_path: Source `.pptx` path.
+        work_dir: Scratch directory for generated outputs.
+        dpi: Rasterization DPI for output slide images.
+        image_format: Output image format accepted by `pdftoppm`.
+
+    Returns:
+        Fully materialized `SlideDeckResult` with local image paths.
+
+    Raises:
+        ValueError: If rendered page count differs from note count.
+    """
+    work_dir = work_dir.resolve()
+    work_dir.mkdir(parents=True, exist_ok=True)
+    pdf_path = work_dir / f"{pptx_path.stem}.pdf"
+    image_dir = work_dir / "slides"
+
+    convert_pptx_to_pdf(pptx_path, pdf_path)
+    image_paths = render_pdf_to_images(
+        pdf_path,
+        image_dir,
+        dpi=dpi,
+        image_format=image_format,
+    )
+    notes = extract_slide_notes(pptx_path)
+
+    if len(image_paths) != len(notes):
+        raise ValueError(
+            "rendered slide count does not match note count: "
+            f"{len(image_paths)} image(s) vs {len(notes)} note entries"
+        )
+
+    slides = [
+        SlideArtifact(index=index, image_path=image_path, notes_plain=note)
+        for index, (image_path, note) in enumerate(zip(image_paths, notes), start=1)
+    ]
+    return SlideDeckResult(source_filename=pptx_path.name, slides=slides)
+
+
+def _extract_notes_text(shapes: Iterable[object]) -> str:
+    """Extract plain text from note shapes while preserving paragraph breaks."""
+    segments: list[str] = []
+    for shape in shapes:
+        text_frame = getattr(shape, "text_frame", None)
+        if text_frame is None:
+            continue
+        # Join paragraph runs because notes often contain formatting splits.
+        text = "\n".join(paragraph.text for paragraph in text_frame.paragraphs).strip()
+        if text:
+            segments.append(text)
+    return "\n\n".join(segments).strip()