mvp implementation
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
[project]
|
||||
name = "officeconvert"
|
||||
version = "0.1.0"
|
||||
description = "Core conversion primitives for PPTX to SlideDeck artifacts."
|
||||
readme = "../../../README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"python-pptx>=1.0.2",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/officeconvert"]
|
||||
@@ -0,0 +1,19 @@
|
||||
"""Public conversion APIs for the officeconvert Python library."""
|
||||
|
||||
from officeconvert.conversion import (
|
||||
SlideArtifact,
|
||||
SlideDeckResult,
|
||||
convert_pptx_to_pdf,
|
||||
convert_pptx_to_slidedeck,
|
||||
extract_slide_notes,
|
||||
render_pdf_to_images,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"SlideArtifact",
|
||||
"SlideDeckResult",
|
||||
"convert_pptx_to_pdf",
|
||||
"convert_pptx_to_slidedeck",
|
||||
"extract_slide_notes",
|
||||
"render_pdf_to_images",
|
||||
]
|
||||
@@ -0,0 +1,225 @@
|
||||
"""Conversion utilities for transforming PPTX files into slide image artifacts."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
from typing import Iterable
|
||||
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SlideArtifact:
|
||||
"""Represents one converted slide image and its extracted notes."""
|
||||
|
||||
index: int
|
||||
image_path: Path
|
||||
notes_plain: str
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SlideDeckResult:
|
||||
"""Represents all conversion artifacts for a single source presentation."""
|
||||
|
||||
source_filename: str
|
||||
slides: list[SlideArtifact]
|
||||
|
||||
|
||||
def convert_pptx_to_pdf(pptx_path: Path, pdf_path: Path, *, timeout_s: int = 120) -> Path:
    """Convert a PPTX file to PDF using headless LibreOffice.

    LibreOffice always names its output `<source stem>.pdf` inside the
    `--outdir` directory. The conversion therefore runs in a private temporary
    directory created next to the destination, and the result is moved to
    `pdf_path` afterwards. This keeps a stale `<stem>.pdf` from an earlier run
    from being mistaken for fresh output, and avoids overwriting an unrelated
    `<stem>.pdf` that happens to live beside `pdf_path`.

    Args:
        pptx_path: Source `.pptx` path.
        pdf_path: Destination `.pdf` path.
        timeout_s: Maximum process runtime in seconds.

    Returns:
        The resolved PDF path.

    Raises:
        FileNotFoundError: If the source PPTX does not exist. `subprocess`
            raises the same type when the `soffice` executable is not found.
        RuntimeError: If LibreOffice fails or does not create expected output.
        subprocess.TimeoutExpired: If LibreOffice runs longer than `timeout_s`.
    """
    # Local import so this function does not depend on the module's import block.
    import tempfile

    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    output_dir = pdf_path.parent.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # The scratch directory lives inside the destination directory so the
    # final `replace` is a same-filesystem rename.
    with tempfile.TemporaryDirectory(dir=output_dir, prefix=".officeconvert-") as scratch:
        scratch_dir = Path(scratch)
        command = [
            "soffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            str(scratch_dir),
            str(pptx_path.resolve()),
        ]
        completed = subprocess.run(
            command,
            check=False,
            capture_output=True,
            text=True,
            timeout=timeout_s,
        )
        if completed.returncode != 0:
            raise RuntimeError(
                f"LibreOffice conversion failed: {completed.stderr.strip() or completed.stdout.strip()}"
            )

        generated_pdf = scratch_dir / f"{pptx_path.stem}.pdf"
        if not generated_pdf.exists():
            # Report only the file name: the scratch directory is removed
            # as soon as this block exits.
            raise RuntimeError(
                f"LibreOffice did not create expected PDF: {generated_pdf.name}"
            )

        generated_pdf.replace(pdf_path)

    return pdf_path.resolve()
|
||||
|
||||
|
||||
def render_pdf_to_images(
    pdf_path: Path,
    out_dir: Path,
    *,
    dpi: int = 180,
    image_format: str = "png",
    timeout_s: int = 120,
) -> list[Path]:
    """Render each PDF page into an image using Poppler's `pdftoppm`.

    Output files are named `slide-<page>.<ext>` inside `out_dir`. Numbered
    `slide-<page>.<ext>` files already present in `out_dir` are deleted before
    rendering so that images from an earlier run cannot be returned as pages
    of this PDF. Files that do not follow that numbered pattern are left
    alone and never returned.

    Args:
        pdf_path: Source PDF path.
        out_dir: Output directory for rendered images.
        dpi: Target rasterization DPI.
        image_format: Image format supported by `pdftoppm` (`png`, `jpeg`, ...).
        timeout_s: Maximum command runtime in seconds.

    Returns:
        Resolved slide image paths ordered by page number.

    Raises:
        FileNotFoundError: If the PDF path does not exist. `subprocess` raises
            the same type when the `pdftoppm` executable is not found.
        RuntimeError: If rasterization fails or no output images are produced.
        subprocess.TimeoutExpired: If `pdftoppm` runs longer than `timeout_s`.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"source PDF does not exist: {pdf_path}")

    out_dir.mkdir(parents=True, exist_ok=True)
    prefix = "slide"
    prefix_path = out_dir / prefix

    # pdftoppm's file extension is not always the flag name: `-jpeg` (and
    # `-jpegcmyk`) write `.jpg`, and `-tiff` writes `.tif`.
    extension = {"jpeg": "jpg", "jpegcmyk": "jpg", "tiff": "tif"}.get(
        image_format, image_format
    )

    def numbered_outputs() -> list[tuple[int, Path]]:
        """Return `(page_number, path)` for every `slide-<digits>.<ext>` file."""
        found = []
        for candidate in out_dir.glob(f"{prefix}-*.{extension}"):
            page = candidate.stem[len(prefix) + 1:]
            if page.isdecimal():
                found.append((int(page), candidate))
        # Sort by page number, not by name, so the order does not rely on
        # zero-padding.
        return sorted(found)

    # Drop leftovers from a previous render into the same directory.
    for _, leftover in numbered_outputs():
        leftover.unlink()

    command = [
        "pdftoppm",
        "-r",
        str(dpi),
        f"-{image_format}",
        str(pdf_path.resolve()),
        str(prefix_path),
    ]
    completed = subprocess.run(
        command,
        check=False,
        capture_output=True,
        text=True,
        timeout=timeout_s,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
        )

    images = [image for _, image in numbered_outputs()]
    if not images:
        raise RuntimeError(f"no rendered images found in {out_dir}")
    return [image.resolve() for image in images]
|
||||
|
||||
|
||||
def extract_slide_notes(pptx_path: Path) -> list[str]:
    """Collect the notes of every slide as plain text, in slide order.

    Args:
        pptx_path: Source presentation path.

    Returns:
        One string per slide, in the order the slides appear in the
        presentation. Slides without a notes slide contribute `""`.

    Raises:
        FileNotFoundError: If the source PPTX does not exist.
    """
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    deck = Presentation(str(pptx_path.resolve()))
    # `has_notes_slide` is tested before `notes_slide` is touched: python-pptx
    # documents that accessing `notes_slide` creates one when none exists.
    return [
        _extract_notes_text(slide.notes_slide.shapes) if slide.has_notes_slide else ""
        for slide in deck.slides
    ]
|
||||
|
||||
|
||||
def convert_pptx_to_slidedeck(
    pptx_path: Path,
    work_dir: Path,
    *,
    dpi: int = 180,
    image_format: str = "png",
) -> SlideDeckResult:
    """Turn a PPTX into per-slide images plus their extracted notes.

    Three steps run in order: `convert_pptx_to_pdf` writes `<stem>.pdf` into
    `work_dir`, `render_pdf_to_images` rasterizes that PDF into
    `work_dir/slides`, and `extract_slide_notes` reads the notes from the
    original PPTX. Images and notes are then paired by position and numbered
    from 1.

    Args:
        pptx_path: Source `.pptx` path.
        work_dir: Scratch directory for generated outputs; created if missing.
        dpi: Rasterization DPI for output slide images.
        image_format: Output image format accepted by `pdftoppm`.

    Returns:
        A `SlideDeckResult` whose `source_filename` is `pptx_path.name` and
        whose slides reference the rendered image paths.

    Raises:
        ValueError: If rendered page count differs from note count.

    Exceptions raised by the three steps above are not caught here and
    propagate to the caller.
    """
    scratch_root = work_dir.resolve()
    scratch_root.mkdir(parents=True, exist_ok=True)
    intermediate_pdf = scratch_root / f"{pptx_path.stem}.pdf"

    convert_pptx_to_pdf(pptx_path, intermediate_pdf)
    rendered = render_pdf_to_images(
        intermediate_pdf,
        scratch_root / "slides",
        dpi=dpi,
        image_format=image_format,
    )
    per_slide_notes = extract_slide_notes(pptx_path)

    image_total = len(rendered)
    note_total = len(per_slide_notes)
    if image_total != note_total:
        raise ValueError(
            "rendered slide count does not match note count: "
            f"{image_total} image(s) vs {note_total} note entries"
        )

    artifacts = []
    for position, rendered_image in enumerate(rendered, start=1):
        artifacts.append(
            SlideArtifact(
                index=position,
                image_path=rendered_image,
                # `position` is 1-based; the notes list is 0-based.
                notes_plain=per_slide_notes[position - 1],
            )
        )
    return SlideDeckResult(source_filename=pptx_path.name, slides=artifacts)
|
||||
|
||||
|
||||
def _extract_notes_text(shapes: Iterable[object]) -> str:
    """Flatten the text of note shapes into one plain-text string.

    Shapes without a `text_frame` attribute (or whose `text_frame` is `None`)
    are skipped. Within a shape, paragraphs are joined with a single newline;
    shapes that still contain text after stripping are joined with a blank
    line between them.
    """
    frames = (getattr(shape, "text_frame", None) for shape in shapes)
    # One stripped text block per shape that actually has a text frame.
    blocks = (
        "\n".join(paragraph.text for paragraph in frame.paragraphs).strip()
        for frame in frames
        if frame is not None
    )
    return "\n\n".join(block for block in blocks if block).strip()
|
||||
Reference in New Issue
Block a user