mvp implementation

This commit is contained in:
2026-03-26 14:01:10 -07:00
parent 0cde587220
commit ebcf404fde
33 changed files with 3048 additions and 6 deletions
@@ -0,0 +1,16 @@
# Packaging metadata for the officeconvert library.
[project]
name = "officeconvert"
version = "0.1.0"
description = "Core conversion primitives for PPTX to SlideDeck artifacts."
# NOTE(review): the readme lives three directories above this file; confirm the
# build backend accepts a path outside the project root.
readme = "../../../README.md"
requires-python = ">=3.12"
# Only pip-installable dependencies are listed here. The conversion code also
# shells out to the `soffice` and `pdftoppm` executables, which must be
# installed separately and available on PATH.
dependencies = [
"python-pptx>=1.0.2",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# The importable package is located under src/ (src layout).
[tool.hatch.build.targets.wheel]
packages = ["src/officeconvert"]
@@ -0,0 +1,19 @@
"""Public conversion APIs for the officeconvert Python library."""
from officeconvert.conversion import (
SlideArtifact,
SlideDeckResult,
convert_pptx_to_pdf,
convert_pptx_to_slidedeck,
extract_slide_notes,
render_pdf_to_images,
)
# Explicit public API of the package; mirrors the names imported from
# `officeconvert.conversion` so `from officeconvert import *` exposes exactly these.
__all__ = [
"SlideArtifact",
"SlideDeckResult",
"convert_pptx_to_pdf",
"convert_pptx_to_slidedeck",
"extract_slide_notes",
"render_pdf_to_images",
]
@@ -0,0 +1,225 @@
"""Conversion utilities for transforming PPTX files into slide image artifacts."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import subprocess
from typing import Iterable
from pptx import Presentation
@dataclass(frozen=True, slots=True)
class SlideArtifact:
"""Represents one converted slide image and its extracted notes."""
index: int
image_path: Path
notes_plain: str
@dataclass(frozen=True, slots=True)
class SlideDeckResult:
"""Represents all conversion artifacts for a single source presentation."""
source_filename: str
slides: list[SlideArtifact]
def convert_pptx_to_pdf(pptx_path: Path, pdf_path: Path, *, timeout_s: int = 120) -> Path:
    """Convert a PPTX file to PDF using headless LibreOffice.

    LibreOffice always names its output ``<source stem>.pdf`` inside the
    ``--outdir`` directory, so the file is renamed afterwards when the
    requested destination has a different name.

    Args:
        pptx_path: Source `.pptx` path.
        pdf_path: Destination `.pdf` path. Its parent directory is created
            when missing.
        timeout_s: Maximum process runtime in seconds.

    Returns:
        The resolved PDF path.

    Raises:
        FileNotFoundError: If the source PPTX does not exist. `subprocess`
            raises the same exception type when the `soffice` executable is
            not on PATH.
        RuntimeError: If LibreOffice fails or does not create expected output.
        subprocess.TimeoutExpired: If LibreOffice runs longer than `timeout_s`.
    """
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    output_dir = pdf_path.parent.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Build both paths from the resolved directory so the "needs rename"
    # comparison below is not fooled by a relative `pdf_path`.
    target_pdf = output_dir / pdf_path.name
    generated_pdf = output_dir / f"{pptx_path.stem}.pdf"

    # Drop any leftover output from a previous run. Otherwise a conversion
    # that exits with status 0 without writing a file would go unnoticed and
    # the stale PDF would be returned as if it were fresh.
    generated_pdf.unlink(missing_ok=True)

    command = [
        "soffice",
        "--headless",
        "--convert-to",
        "pdf",
        "--outdir",
        str(output_dir),
        str(pptx_path.resolve()),
    ]
    completed = subprocess.run(
        command,
        check=False,
        capture_output=True,
        text=True,
        timeout=timeout_s,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"LibreOffice conversion failed: {completed.stderr.strip() or completed.stdout.strip()}"
        )
    if not generated_pdf.exists():
        raise RuntimeError(f"LibreOffice did not create expected PDF: {generated_pdf}")

    if generated_pdf != target_pdf:
        generated_pdf.replace(target_pdf)
    return pdf_path.resolve()
# File extensions written by `pdftoppm` for format flags whose name differs
# from the extension (for example `-jpeg` produces `.jpg` files).
_PDFTOPPM_EXTENSIONS = {
    "jpeg": "jpg",
    "jpegcmyk": "jpg",
    "tiff": "tif",
    "mono": "pbm",
    "gray": "pgm",
}


def render_pdf_to_images(
    pdf_path: Path,
    out_dir: Path,
    *,
    dpi: int = 180,
    image_format: str = "png",
    timeout_s: int = 120,
) -> list[Path]:
    """Render each PDF page into an image using Poppler's `pdftoppm`.

    Images are written as ``slide-<page>.<ext>`` inside `out_dir`. Files
    already matching that pattern are deleted before rendering so leftovers
    from an earlier run cannot end up in the returned list.

    Args:
        pdf_path: Source PDF path.
        out_dir: Output directory for rendered images. Created when missing.
        dpi: Target rasterization DPI.
        image_format: Image format flag supported by `pdftoppm` (`png`,
            `jpeg`, `tiff`, ...), passed to the tool as ``-<image_format>``.
        timeout_s: Maximum command runtime in seconds.

    Returns:
        Ordered list of slide image paths.

    Raises:
        FileNotFoundError: If the PDF path does not exist. `subprocess`
            raises the same exception type when the `pdftoppm` executable is
            not on PATH.
        RuntimeError: If rasterization fails or no output images are produced.
        subprocess.TimeoutExpired: If `pdftoppm` runs longer than `timeout_s`.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"source PDF does not exist: {pdf_path}")

    out_dir.mkdir(parents=True, exist_ok=True)
    prefix_path = out_dir / "slide"

    # The format flag and the extension on disk are not always the same.
    extension = _PDFTOPPM_EXTENSIONS.get(image_format, image_format)
    pattern = f"slide-*.{extension}"

    # Clear earlier renderings: a previous, longer PDF would otherwise leave
    # extra pages behind that the glob below would report as current output.
    for stale in out_dir.glob(pattern):
        if stale.is_file():
            stale.unlink()

    command = [
        "pdftoppm",
        "-r",
        str(dpi),
        f"-{image_format}",
        str(pdf_path.resolve()),
        str(prefix_path),
    ]
    completed = subprocess.run(
        command,
        check=False,
        capture_output=True,
        text=True,
        timeout=timeout_s,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
        )

    # NOTE(review): lexicographic order equals page order only while pdftoppm
    # zero-pads page numbers to a common width — confirm for the deployed version.
    images = sorted(out_dir.glob(pattern))
    if not images:
        raise RuntimeError(f"no rendered images found in {out_dir}")
    return [image.resolve() for image in images]
def extract_slide_notes(pptx_path: Path) -> list[str]:
    """Extract plain-text notes for each slide in slide index order.

    Only the notes body placeholder of each notes slide is read. Other shapes
    on a notes page (slide-number, header, footer and date placeholders) also
    carry text frames, and reading them would mix that text into the notes.

    Args:
        pptx_path: Source presentation path.

    Returns:
        A list of note strings aligned with source slide order. Slides
        without notes contribute an empty string.

    Raises:
        FileNotFoundError: If the source PPTX does not exist.
    """
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    presentation = Presentation(str(pptx_path.resolve()))
    notes: list[str] = []
    for slide in presentation.slides:
        # Check first: accessing `slide.notes_slide` on a slide that has none
        # makes python-pptx create one.
        if not slide.has_notes_slide:
            notes.append("")
            continue
        # `notes_placeholder` is the shape holding the actual notes text, or
        # None when the notes slide has no such placeholder.
        placeholder = slide.notes_slide.notes_placeholder
        notes.append(_extract_notes_text([] if placeholder is None else [placeholder]))
    return notes
def convert_pptx_to_slidedeck(
    pptx_path: Path,
    work_dir: Path,
    *,
    dpi: int = 180,
    image_format: str = "png",
    timeout_s: int = 120,
) -> SlideDeckResult:
    """Convert a PPTX into rendered images and extracted notes.

    The pipeline performs PPTX->PDF conversion with LibreOffice and then PDF->images
    rendering with Poppler. Notes are extracted from the original PPTX so text
    fidelity is preserved independent of rendering output.

    Args:
        pptx_path: Source `.pptx` path.
        work_dir: Scratch directory for generated outputs. The PDF is written
            directly inside it and the images into its `slides` subdirectory.
        dpi: Rasterization DPI for output slide images.
        image_format: Output image format accepted by `pdftoppm`.
        timeout_s: Maximum runtime in seconds for each external command. It
            applies to the LibreOffice step and the Poppler step separately.

    Returns:
        Fully materialized `SlideDeckResult` with local image paths. Slide
        indices start at 1.

    Raises:
        FileNotFoundError: If the source PPTX does not exist.
        RuntimeError: If the PDF conversion or the rasterization step fails.
        ValueError: If rendered page count differs from note count.
    """
    # Fail fast, before any scratch directory is created for a bad input.
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    work_dir = work_dir.resolve()
    work_dir.mkdir(parents=True, exist_ok=True)
    pdf_path = work_dir / f"{pptx_path.stem}.pdf"
    image_dir = work_dir / "slides"

    convert_pptx_to_pdf(pptx_path, pdf_path, timeout_s=timeout_s)
    image_paths = render_pdf_to_images(
        pdf_path,
        image_dir,
        dpi=dpi,
        image_format=image_format,
        timeout_s=timeout_s,
    )
    notes = extract_slide_notes(pptx_path)

    # NOTE(review): a mismatch presumably means the PDF export dropped or
    # added pages relative to the deck (hidden slides?) — confirm the cause.
    if len(image_paths) != len(notes):
        raise ValueError(
            "rendered slide count does not match note count: "
            f"{len(image_paths)} image(s) vs {len(notes)} note entries"
        )

    slides = [
        SlideArtifact(index=index, image_path=image_path, notes_plain=note)
        for index, (image_path, note) in enumerate(
            zip(image_paths, notes, strict=True), start=1
        )
    ]
    return SlideDeckResult(source_filename=pptx_path.name, slides=slides)
def _extract_notes_text(shapes: Iterable[object]) -> str:
    """Collect the text of every shape that exposes a text frame.

    Paragraphs of one shape are joined with a single newline and the result
    is stripped. Shapes whose text is empty after stripping are dropped. The
    remaining shape texts are joined with a blank line.

    Args:
        shapes: Objects to inspect. Any object without a `text_frame`
            attribute, or whose `text_frame` is None, is ignored.

    Returns:
        The combined text, or an empty string when no shape yields text.
    """
    frames = (getattr(shape, "text_frame", None) for shape in shapes)
    bodies = (
        "\n".join(para.text for para in frame.paragraphs).strip()
        for frame in frames
        if frame is not None
    )
    return "\n\n".join(body for body in bodies if body).strip()