mvp implementation

This commit is contained in:
2026-03-26 14:01:10 -07:00
parent 0cde587220
commit ebcf404fde
33 changed files with 3048 additions and 6 deletions
@@ -0,0 +1,16 @@
[project]
name = "officeconvert"
version = "0.1.0"
description = "Core conversion primitives for PPTX to SlideDeck artifacts."
readme = "../../../README.md"
requires-python = ">=3.12"
dependencies = [
"python-pptx>=1.0.2",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src/officeconvert"]
@@ -0,0 +1,19 @@
"""Public conversion APIs for the officeconvert Python library."""
from officeconvert.conversion import (
SlideArtifact,
SlideDeckResult,
convert_pptx_to_pdf,
convert_pptx_to_slidedeck,
extract_slide_notes,
render_pdf_to_images,
)
# Names re-exported from officeconvert.conversion as this package's public API;
# this list also controls what `from officeconvert import *` provides.
__all__ = [
    "SlideArtifact",
    "SlideDeckResult",
    "convert_pptx_to_pdf",
    "convert_pptx_to_slidedeck",
    "extract_slide_notes",
    "render_pdf_to_images",
]
@@ -0,0 +1,225 @@
"""Conversion utilities for transforming PPTX files into slide image artifacts."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import subprocess
from typing import Iterable
from pptx import Presentation
@dataclass(frozen=True, slots=True)
class SlideArtifact:
"""Represents one converted slide image and its extracted notes."""
index: int
image_path: Path
notes_plain: str
@dataclass(slots=True, frozen=True)
class SlideDeckResult:
    """Bundle of every artifact produced from one source presentation.

    Attributes:
        source_filename: File name of the presentation the slides came from.
        slides: Per-slide artifacts in deck order.
    """

    source_filename: str
    # NOTE: `frozen` only blocks re-binding the attribute; the list itself
    # can still be mutated by whoever holds a reference to it.
    slides: list[SlideArtifact]
def convert_pptx_to_pdf(pptx_path: Path, pdf_path: Path, *, timeout_s: int = 120) -> Path:
    """Convert a PPTX file to PDF using headless LibreOffice.

    LibreOffice is asked to write into the destination directory; this code
    expects the output to be named ``<source stem>.pdf`` there and renames it
    to ``pdf_path`` when the requested file name differs.

    Args:
        pptx_path: Source `.pptx` path.
        pdf_path: Destination `.pdf` path. Missing parent directories are created.
        timeout_s: Maximum process runtime in seconds.

    Returns:
        The resolved PDF path.

    Raises:
        FileNotFoundError: If the source PPTX does not exist. The same
            exception type also propagates from ``subprocess`` when the
            ``soffice`` executable cannot be launched.
        RuntimeError: If LibreOffice fails or does not create expected output.
        subprocess.TimeoutExpired: If LibreOffice runs longer than `timeout_s`.
    """
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    output_dir = pdf_path.parent.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Both paths are anchored on the resolved directory so the "same file"
    # comparison below is not defeated by a relative or symlinked `pdf_path`.
    generated_pdf = output_dir / f"{pptx_path.stem}.pdf"
    target_pdf = output_dir / pdf_path.name

    # A leftover PDF from an earlier run would make the existence check after
    # the conversion pass even if this run produced nothing. LibreOffice would
    # overwrite this file anyway, so removing it first loses no extra data.
    generated_pdf.unlink(missing_ok=True)

    # NOTE(review): concurrent invocations presumably share LibreOffice's
    # default user profile; confirm whether an isolated
    # `-env:UserInstallation=...` is needed for parallel conversions.
    command = [
        "soffice",
        "--headless",
        "--convert-to",
        "pdf",
        "--outdir",
        str(output_dir),
        str(pptx_path.resolve()),
    ]
    completed = subprocess.run(
        command,
        check=False,
        capture_output=True,
        text=True,
        timeout=timeout_s,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"LibreOffice conversion failed: {completed.stderr.strip() or completed.stdout.strip()}"
        )

    # The exit status alone is not trusted; the output file must exist.
    if not generated_pdf.exists():
        raise RuntimeError(f"LibreOffice did not create expected PDF: {generated_pdf}")

    if generated_pdf != target_pdf:
        generated_pdf.replace(target_pdf)
    return pdf_path.resolve()
def render_pdf_to_images(
    pdf_path: Path,
    out_dir: Path,
    *,
    dpi: int = 180,
    image_format: str = "png",
    timeout_s: int = 120,
) -> list[Path]:
    """Render each PDF page into an image using Poppler's `pdftoppm`.

    Images are written as ``slide-<page>.<ext>`` inside `out_dir`.

    Args:
        pdf_path: Source PDF path.
        out_dir: Output directory for rendered images (created if missing).
        dpi: Target rasterization DPI.
        image_format: Image format supported by `pdftoppm` (`png`, `jpeg`, ...).
            It is passed to the tool as the ``-<image_format>`` flag.
        timeout_s: Maximum command runtime in seconds.

    Returns:
        Resolved slide image paths ordered by page number.

    Raises:
        FileNotFoundError: If the PDF path does not exist. The same exception
            type also propagates from ``subprocess`` when ``pdftoppm`` cannot
            be launched.
        RuntimeError: If rasterization fails or no output images are produced.
        subprocess.TimeoutExpired: If `pdftoppm` runs longer than `timeout_s`.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"source PDF does not exist: {pdf_path}")

    out_dir.mkdir(parents=True, exist_ok=True)
    prefix = "slide"
    command = [
        "pdftoppm",
        "-r",
        str(dpi),
        f"-{image_format}",
        str(pdf_path.resolve()),
        str(out_dir / prefix),
    ]
    completed = subprocess.run(
        command,
        check=False,
        capture_output=True,
        text=True,
        timeout=timeout_s,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
        )

    # The file extension pdftoppm writes is not always the flag name:
    # `-jpeg` produces `.jpg` files and `-tiff` produces `.tif` files.
    extension = {"jpeg": "jpg", "tiff": "tif"}.get(image_format, image_format)

    # Order by the numeric page suffix instead of by file name, so the result
    # does not depend on how the page numbers happen to be zero-padded.
    numbered: list[tuple[int, Path]] = []
    for candidate in out_dir.glob(f"{prefix}-*.{extension}"):
        page_token = candidate.stem.removeprefix(f"{prefix}-")
        if page_token.isascii() and page_token.isdigit():
            numbered.append((int(page_token), candidate))
    if not numbered:
        raise RuntimeError(f"no rendered images found in {out_dir}")
    return [image.resolve() for _, image in sorted(numbered)]
def extract_slide_notes(pptx_path: Path) -> list[str]:
    """Collect the plain-text notes of every slide, in slide order.

    Args:
        pptx_path: Source presentation path.

    Returns:
        One string per slide, aligned with the source slide order. Slides
        without a notes slide contribute an empty string.

    Raises:
        FileNotFoundError: If the source PPTX does not exist.
    """
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    deck = Presentation(str(pptx_path.resolve()))
    # `notes_slide` is only touched after `has_notes_slide` says one exists.
    return [
        _extract_notes_text(slide.notes_slide.shapes) if slide.has_notes_slide else ""
        for slide in deck.slides
    ]
def convert_pptx_to_slidedeck(
    pptx_path: Path,
    work_dir: Path,
    *,
    dpi: int = 180,
    image_format: str = "png",
) -> SlideDeckResult:
    """Turn a PPTX into per-slide images paired with their notes.

    Steps: PPTX -> PDF (LibreOffice), PDF -> images (Poppler), then notes are
    read straight from the original PPTX so their text does not depend on the
    rendering output.

    Args:
        pptx_path: Source `.pptx` path.
        work_dir: Scratch directory for generated outputs; the PDF is written
            directly inside it and the images into its `slides` subdirectory.
        dpi: Rasterization DPI for output slide images.
        image_format: Output image format accepted by `pdftoppm`.

    Returns:
        Fully materialized `SlideDeckResult` with local image paths and
        slides numbered from 1.

    Raises:
        ValueError: If rendered page count differs from note count.
    """
    scratch = work_dir.resolve()
    scratch.mkdir(parents=True, exist_ok=True)

    intermediate_pdf = scratch / f"{pptx_path.stem}.pdf"
    convert_pptx_to_pdf(pptx_path, intermediate_pdf)
    rendered = render_pdf_to_images(
        intermediate_pdf,
        scratch / "slides",
        dpi=dpi,
        image_format=image_format,
    )
    slide_notes = extract_slide_notes(pptx_path)

    # NOTE(review): a mismatch presumably means the renderer dropped or added
    # pages relative to the PPTX (hidden slides, for instance) -- confirm.
    image_total, note_total = len(rendered), len(slide_notes)
    if image_total != note_total:
        raise ValueError(
            "rendered slide count does not match note count: "
            f"{image_total} image(s) vs {note_total} note entries"
        )

    artifacts: list[SlideArtifact] = []
    for position, image_path in enumerate(rendered, start=1):
        artifacts.append(
            SlideArtifact(
                index=position,
                image_path=image_path,
                notes_plain=slide_notes[position - 1],
            )
        )
    return SlideDeckResult(source_filename=pptx_path.name, slides=artifacts)
def _extract_notes_text(shapes: Iterable[object]) -> str:
    """Flatten the text of note shapes into one plain-text string.

    Shapes without a `text_frame` attribute (or with it set to None) are
    ignored. Within a shape, paragraphs are separated by a single newline;
    shapes are separated from each other by a blank line. Shapes whose text
    is empty after stripping are dropped.

    NOTE(review): every shape that carries a text frame contributes, which
    presumably includes non-body placeholders (slide number, footer, ...) --
    confirm that this is intended.
    """
    frames = (getattr(shape, "text_frame", None) for shape in shapes)
    bodies = (
        "\n".join(paragraph.text for paragraph in frame.paragraphs).strip()
        for frame in frames
        if frame is not None
    )
    return "\n\n".join(body for body in bodies if body).strip()
+19
View File
@@ -0,0 +1,19 @@
[project]
name = "officeconvert-server"
version = "0.1.0"
description = "ConnectRPC server orchestrating file conversions with MinIO."
readme = "../../../README.md"
requires-python = ">=3.12"
dependencies = [
"connectrpc>=0.6.0",
"minio>=7.2.18",
"officeconvert",
"uvicorn>=0.35.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src/officeconvert_server"]
@@ -0,0 +1,13 @@
"""Public exports for the officeconvert server package."""
from officeconvert_server.app import app, create_app
from officeconvert_server.config import ServerConfig, load_server_config
from officeconvert_server.service import ConversionServiceImpl
# Public names of the officeconvert_server package, all imported above from
# its app, config, and service modules.
__all__ = [
    "ServerConfig",
    "ConversionServiceImpl",
    "app",
    "create_app",
    "load_server_config",
]
@@ -0,0 +1,27 @@
"""ASGI application entrypoint for the officeconvert Connect service."""
from __future__ import annotations
from officeconvertapi.v1.conversion_connect import ConversionServiceASGIApplication
from officeconvert_server.config import load_server_config
from officeconvert_server.service import ConversionServiceImpl
from officeconvert_server.storage import MinIOStore
def create_app() -> ConversionServiceASGIApplication:
    """Build the Connect ASGI application from environment configuration.

    Loads the server configuration, creates the MinIO storage adapter from
    it, and wraps a `ConversionServiceImpl` in the generated ASGI application.
    """
    settings = load_server_config()
    return ConversionServiceASGIApplication(
        ConversionServiceImpl(
            config=settings,
            store=MinIOStore(
                endpoint=settings.minio_endpoint,
                access_key=settings.minio_access_key,
                secret_key=settings.minio_secret_key,
                secure=settings.minio_secure,
                public_endpoint=settings.minio_public_endpoint,
            ),
        )
    )
# Exported ASGI application for `uvicorn officeconvert_server.app:app`.
# It is built at import time, so importing this module reads the environment
# configuration and constructs the storage adapter and service.
app = create_app()
@@ -0,0 +1,34 @@
"""Runtime configuration for the officeconvert Connect server."""
from __future__ import annotations
from dataclasses import dataclass
import os
@dataclass(frozen=True, slots=True)
class ServerConfig:
"""Defines environment-driven settings for server orchestration."""
minio_endpoint: str
minio_access_key: str
minio_secret_key: str
minio_secure: bool
minio_public_endpoint: str
minio_session_ttl_seconds: int
conversion_cleanup_delay_seconds: int
def load_server_config() -> ServerConfig:
"""Load server configuration from environment variables."""
return ServerConfig(
minio_endpoint=os.getenv("MINIO_ENDPOINT", "localhost:9000"),
minio_access_key=os.getenv("MINIO_ACCESS_KEY", "minioadmin"),
minio_secret_key=os.getenv("MINIO_SECRET_KEY", "minioadmin"),
minio_secure=os.getenv("MINIO_USE_SSL", "false").lower() == "true",
minio_public_endpoint=os.getenv("MINIO_PUBLIC_ENDPOINT", "localhost:9000"),
minio_session_ttl_seconds=int(os.getenv("MINIO_SESSION_TTL_SECONDS", "3600")),
conversion_cleanup_delay_seconds=int(
os.getenv("CONVERSION_CLEANUP_DELAY_SECONDS", "3600")
),
)
@@ -0,0 +1,30 @@
"""In-memory models representing conversion workflow state."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
def utc_now() -> datetime:
    """Return the current moment as a timezone-aware datetime in UTC."""
    return datetime.now(timezone.utc)
@dataclass(slots=True)
class ConversionSession:
    """Mutable bookkeeping for one conversion, from creation to cleanup.

    Attributes:
        conversion_id: Identifier of the conversion.
        source_filename: File name supplied when the conversion was created.
        bucket_name: Storage bucket holding this conversion's objects.
        upload_object_key: Object key the source file is uploaded under.
        status: Integer status code of the conversion.
        created_at: Creation time; defaults to the current UTC time.
        updated_at: Time of the last state change; defaults to the current
            UTC time.
        error_message: Failure description; empty unless one was recorded.
        slide_deck: Finished slide deck payload, once available.
        work_dir: Local scratch directory, while one exists.
        conversion_task: Handle of the running conversion, if any.
        cleanup_task: Handle of the scheduled cleanup, if any.
    """

    conversion_id: str
    source_filename: str
    bucket_name: str
    upload_object_key: str
    status: int
    created_at: datetime = field(default_factory=utc_now)
    updated_at: datetime = field(default_factory=utc_now)
    error_message: str = ""
    slide_deck: Any | None = None
    work_dir: Path | None = None
    conversion_task: Any | None = None
    cleanup_task: Any | None = None
@@ -0,0 +1,269 @@
"""Connect service implementation for conversion request orchestration."""
from __future__ import annotations
import asyncio
from datetime import datetime, timedelta, timezone
from pathlib import Path
import shutil
import tempfile
import uuid
from connectrpc.code import Code
from connectrpc.errors import ConnectError
from connectrpc.request import RequestContext
from google.protobuf.timestamp_pb2 import Timestamp
from officeconvert import SlideArtifact, convert_pptx_to_slidedeck
from officeconvertapi.v1 import conversion_connect, conversion_pb2
from officeconvert_server.config import ServerConfig
from officeconvert_server.models import ConversionSession, utc_now
from officeconvert_server.storage import MinIOStore
class ConversionServiceImpl(conversion_connect.ConversionService):
    """Implements the conversion API with in-memory state and MinIO orchestration.

    Session state lives in a dictionary on this instance, guarded by an
    ``asyncio.Lock``; nothing is persisted. Blocking storage and conversion
    calls are pushed to worker threads with ``asyncio.to_thread`` so they do
    not stall the event loop.
    """

    def __init__(self, config: ServerConfig, store: MinIOStore) -> None:
        """Initialize service with runtime config and storage adapter."""
        self._config = config
        self._store = store
        self._sessions: dict[str, ConversionSession] = {}
        self._lock = asyncio.Lock()

    async def create_conversion(
        self,
        request: conversion_pb2.CreateConversionRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.CreateConversionResponse:
        """Create a new conversion session and return upload credentials.

        A dedicated bucket is created for the session and a presigned PUT URL
        for the source file is returned to the caller.

        Raises:
            ConnectError: ``INVALID_ARGUMENT`` when the file name is empty or
                does not end in ``.pptx``.
        """
        del ctx
        source_filename = request.source_filename.strip()
        if not source_filename:
            raise ConnectError(Code.INVALID_ARGUMENT, "source_filename is required")
        if not source_filename.lower().endswith(".pptx"):
            raise ConnectError(Code.INVALID_ARGUMENT, "only .pptx input is supported")

        conversion_id = str(uuid.uuid4())
        bucket_name = f"oc-{conversion_id}"
        upload_key = "input/source.pptx"
        expires_at = utc_now() + timedelta(seconds=self._config.minio_session_ttl_seconds)

        # The storage adapter is synchronous; run it off the event loop like
        # every other storage call in this class.
        await asyncio.to_thread(self._store.ensure_bucket, bucket_name)
        upload_url = await asyncio.to_thread(
            self._store.presigned_put_url,
            bucket_name,
            upload_key,
            ttl_seconds=self._config.minio_session_ttl_seconds,
        )

        session = ConversionSession(
            conversion_id=conversion_id,
            source_filename=source_filename,
            bucket_name=bucket_name,
            upload_object_key=upload_key,
            status=conversion_pb2.CONVERSION_STATUS_PENDING,
        )
        async with self._lock:
            self._sessions[conversion_id] = session

        return conversion_pb2.CreateConversionResponse(
            conversion_id=conversion_id,
            upload_bucket=bucket_name,
            upload_object_key=upload_key,
            upload_url=upload_url,
            expires_at=_to_timestamp(expires_at),
        )

    async def start_conversion(
        self,
        request: conversion_pb2.StartConversionRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.StartConversionResponse:
        """Start asynchronous conversion for an already-uploaded session payload.

        Calling this again while the conversion is running is a no-op that
        reports the current status.

        Raises:
            ConnectError: ``NOT_FOUND`` for an unknown id; ``FAILED_PRECONDITION``
                when the conversion already reached a terminal state.
        """
        del ctx
        session = await self._get_session(request.conversion_id)

        async with self._lock:
            if session.status == conversion_pb2.CONVERSION_STATUS_RUNNING:
                return conversion_pb2.StartConversionResponse(
                    conversion_id=session.conversion_id,
                    status=session.status,
                )
            if session.status in (
                conversion_pb2.CONVERSION_STATUS_FAILED,
                conversion_pb2.CONVERSION_STATUS_SUCCEEDED,
            ):
                raise ConnectError(
                    Code.FAILED_PRECONDITION,
                    "conversion has already completed",
                )
            session.status = conversion_pb2.CONVERSION_STATUS_RUNNING
            session.updated_at = utc_now()
            # The task handle is kept on the session so it can be cancelled.
            session.conversion_task = asyncio.create_task(self._run_conversion(session))

        return conversion_pb2.StartConversionResponse(
            conversion_id=session.conversion_id,
            status=session.status,
        )

    async def get_conversion_status(
        self,
        request: conversion_pb2.GetConversionStatusRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.GetConversionStatusResponse:
        """Return current conversion status and optional error details.

        Raises:
            ConnectError: ``NOT_FOUND`` for an unknown id.
        """
        del ctx
        session = await self._get_session(request.conversion_id)
        return conversion_pb2.GetConversionStatusResponse(
            conversion_id=session.conversion_id,
            status=session.status,
            error_message=session.error_message,
            updated_at=_to_timestamp(session.updated_at),
        )

    async def get_slide_deck(
        self,
        request: conversion_pb2.GetSlideDeckRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.GetSlideDeckResponse:
        """Return the finished slide deck once conversion succeeds.

        Raises:
            ConnectError: ``NOT_FOUND`` for an unknown id; ``FAILED_PRECONDITION``
                when the conversion failed or has not finished; ``INTERNAL``
                when a succeeded session carries no slide deck.
        """
        del ctx
        session = await self._get_session(request.conversion_id)
        if session.status == conversion_pb2.CONVERSION_STATUS_FAILED:
            raise ConnectError(Code.FAILED_PRECONDITION, session.error_message)
        if session.status != conversion_pb2.CONVERSION_STATUS_SUCCEEDED:
            raise ConnectError(Code.FAILED_PRECONDITION, "conversion is not finished yet")
        if session.slide_deck is None:
            raise ConnectError(Code.INTERNAL, "slide deck missing from successful session")
        return conversion_pb2.GetSlideDeckResponse(slide_deck=session.slide_deck)

    async def delete_conversion(
        self,
        request: conversion_pb2.DeleteConversionRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.DeleteConversionResponse:
        """Delete a conversion session and associated MinIO/local artifacts.

        Unknown ids are not an error; the response reports ``deleted=False``.
        """
        del ctx
        async with self._lock:
            session = self._sessions.pop(request.conversion_id, None)

        if session is None:
            return conversion_pb2.DeleteConversionResponse(
                conversion_id=request.conversion_id,
                deleted=False,
            )

        if session.cleanup_task is not None:
            session.cleanup_task.cancel()
        if session.conversion_task is not None and not session.conversion_task.done():
            session.conversion_task.cancel()

        await self._cleanup_local_artifacts(session)
        await asyncio.to_thread(self._store.remove_bucket_tree, session.bucket_name)

        return conversion_pb2.DeleteConversionResponse(
            conversion_id=session.conversion_id,
            deleted=True,
        )

    async def _run_conversion(self, session: ConversionSession) -> None:
        """Execute conversion flow and persist terminal state in memory.

        Downloads the uploaded source, converts it, uploads the slide images,
        and records SUCCEEDED or FAILED on the session. Local scratch files
        are always removed; delayed bucket cleanup is scheduled only while the
        session is still registered.
        """
        work_dir = Path(
            tempfile.mkdtemp(prefix=f"officeconvert-{session.conversion_id}-")
        ).resolve()
        session.work_dir = work_dir
        source_path = work_dir / "input.pptx"

        try:
            await asyncio.to_thread(
                self._store.fget_object,
                session.bucket_name,
                session.upload_object_key,
                source_path,
            )
            result = await asyncio.to_thread(
                convert_pptx_to_slidedeck,
                source_path,
                work_dir,
            )
            session.slide_deck = await asyncio.to_thread(
                self._upload_and_build_slide_deck,
                session,
                result.slides,
                # The local copy is always called "input.pptx", so the name in
                # the conversion result is meaningless to clients; report the
                # file name they supplied when creating the session.
                session.source_filename,
            )
            session.status = conversion_pb2.CONVERSION_STATUS_SUCCEEDED
            session.updated_at = utc_now()
        except asyncio.CancelledError:
            session.status = conversion_pb2.CONVERSION_STATUS_FAILED
            session.error_message = "conversion cancelled"
            session.updated_at = utc_now()
            raise
        except Exception as exc:
            # Task boundary: any failure becomes a FAILED status for polling
            # clients. Fall back to the exception type when it has no message.
            session.status = conversion_pb2.CONVERSION_STATUS_FAILED
            session.error_message = str(exc) or type(exc).__name__
            session.updated_at = utc_now()
        finally:
            await self._cleanup_local_artifacts(session)
            # delete_conversion() unregisters the session before cancelling
            # this task and removes the bucket itself. Scheduling the delayed
            # cleanup then would leave an orphaned task that nobody cancels.
            async with self._lock:
                still_registered = self._sessions.get(session.conversion_id) is session
            if still_registered:
                session.cleanup_task = asyncio.create_task(self._delayed_cleanup(session))

    def _upload_and_build_slide_deck(
        self,
        session: ConversionSession,
        slides: list[SlideArtifact],
        source_filename: str,
    ) -> conversion_pb2.SlideDeck:
        """Upload generated slide images and construct API response payload.

        Synchronous; intended to run in a worker thread. Each image is stored
        under ``output/slide-NNNN<ext>`` in the session bucket and exposed
        through a presigned GET URL.
        """
        response_slides: list[conversion_pb2.Slide] = []
        for slide in slides:
            object_key = f"output/slide-{slide.index:04d}{slide.image_path.suffix}"
            self._store.fput_object(session.bucket_name, object_key, slide.image_path)
            image_url = self._store.presigned_get_url(
                session.bucket_name,
                object_key,
                ttl_seconds=self._config.minio_session_ttl_seconds,
            )
            response_slides.append(
                conversion_pb2.Slide(
                    index=slide.index,
                    notes_plain=slide.notes_plain,
                    image_url=image_url,
                )
            )

        return conversion_pb2.SlideDeck(
            conversion_id=session.conversion_id,
            source_filename=source_filename,
            slides=response_slides,
            created_at=_to_timestamp(utc_now()),
        )

    async def _delayed_cleanup(self, session: ConversionSession) -> None:
        """Delete storage resources after the configured session retention period.

        The session is dropped from the registry whether the wait completes,
        the bucket removal fails, or the task is cancelled.
        """
        try:
            await asyncio.sleep(self._config.conversion_cleanup_delay_seconds)
            await asyncio.to_thread(self._store.remove_bucket_tree, session.bucket_name)
        except asyncio.CancelledError:
            return
        finally:
            async with self._lock:
                self._sessions.pop(session.conversion_id, None)

    async def _cleanup_local_artifacts(self, session: ConversionSession) -> None:
        """Delete temporary local files for a session if they still exist."""
        if session.work_dir is not None and session.work_dir.exists():
            # Second positional argument is `ignore_errors=True`.
            await asyncio.to_thread(shutil.rmtree, session.work_dir, True)
        session.work_dir = None

    async def _get_session(self, conversion_id: str) -> ConversionSession:
        """Return an existing session or raise a NOT_FOUND error."""
        async with self._lock:
            session = self._sessions.get(conversion_id)
        if session is None:
            raise ConnectError(Code.NOT_FOUND, "conversion_id not found")
        return session
def _to_timestamp(value: datetime) -> Timestamp:
    """Build a protobuf Timestamp from a timezone-aware datetime.

    The value is converted to UTC before being stored in the message.
    """
    stamp = Timestamp()
    stamp.FromDatetime(value.astimezone(timezone.utc))
    return stamp
@@ -0,0 +1,94 @@
"""MinIO helper abstraction for upload and artifact lifecycle."""
from __future__ import annotations
from datetime import timedelta
from pathlib import Path
from urllib.parse import urlparse
from minio import Minio
from minio.deleteobjects import DeleteObject
from minio.error import S3Error
class MinIOStore:
    """Provides typed helper methods around MinIO object storage operations.

    Two clients are kept: one built from `endpoint` for bucket and object
    operations, and one built from `public_endpoint` that is used only to
    generate presigned URLs.
    """

    def __init__(
        self,
        *,
        endpoint: str,
        access_key: str,
        secret_key: str,
        secure: bool,
        public_endpoint: str,
    ) -> None:
        """Initialize MinIO clients for internal and public URL generation.

        Args:
            endpoint: Endpoint used for bucket and object operations.
            access_key: Access key shared by both clients.
            secret_key: Secret key shared by both clients.
            secure: Passed to both clients as their ``secure`` flag.
            public_endpoint: Endpoint embedded in presigned URLs.
        """
        self._client = Minio(
            endpoint,
            access_key=access_key,
            secret_key=secret_key,
            secure=secure,
        )
        self._public_client = Minio(
            public_endpoint,
            access_key=access_key,
            secret_key=secret_key,
            secure=secure,
        )

    def ensure_bucket(self, bucket_name: str) -> None:
        """Create a bucket if it does not already exist."""
        if not self._client.bucket_exists(bucket_name):
            self._client.make_bucket(bucket_name)

    def presigned_put_url(self, bucket_name: str, object_key: str, *, ttl_seconds: int) -> str:
        """Generate a presigned PUT URL for a single object upload.

        The URL is produced by the public-endpoint client and expires after
        `ttl_seconds`.
        """
        return self._public_client.presigned_put_object(
            bucket_name,
            object_key,
            expires=timedelta(seconds=ttl_seconds),
        )

    def presigned_get_url(self, bucket_name: str, object_key: str, *, ttl_seconds: int) -> str:
        """Generate a presigned GET URL for downloading one object.

        The URL is produced by the public-endpoint client and expires after
        `ttl_seconds`.
        """
        return self._public_client.presigned_get_object(
            bucket_name,
            object_key,
            expires=timedelta(seconds=ttl_seconds),
        )

    def fget_object(self, bucket_name: str, object_key: str, output_path: Path) -> None:
        """Download one object from MinIO to a local filesystem path.

        Missing parent directories of `output_path` are created first.
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)
        self._client.fget_object(bucket_name, object_key, str(output_path))

    def fput_object(self, bucket_name: str, object_key: str, source_path: Path) -> None:
        """Upload one local filesystem object to MinIO."""
        self._client.fput_object(bucket_name, object_key, str(source_path))

    def remove_bucket_tree(self, bucket_name: str) -> None:
        """Remove all objects in a bucket and then delete the bucket.

        A bucket that no longer exists is treated as already cleaned up,
        whichever of the storage calls reports it.

        Raises:
            RuntimeError: If the server reports a failure deleting an object.
            S3Error: For storage errors other than ``NoSuchBucket``.
        """
        try:
            doomed = [
                DeleteObject(obj.object_name)
                for obj in self._client.list_objects(bucket_name, recursive=True)
            ]
            if doomed:
                # The deletion result is consumed here; the first reported
                # failure aborts the cleanup.
                for failure in self._client.remove_objects(bucket_name, doomed):
                    # Accept either attribute spelling for the failed key so
                    # that building the message cannot itself raise.
                    failed_key = (
                        getattr(failure, "name", None)
                        or getattr(failure, "object_name", None)
                        or "<unknown>"
                    )
                    raise RuntimeError(
                        f"failed to delete object {failed_key}: {failure.message}"
                    )
            self._client.remove_bucket(bucket_name)
        except S3Error as exc:
            # Concurrent cleanup paths may race to remove the same bucket, and
            # the loser can hit the missing bucket while listing objects, not
            # only on the final removal.
            if exc.code != "NoSuchBucket":
                raise
def object_key_from_presigned_url(url: str) -> str:
    """Recover the object key from a presigned URL's path, for diagnostics.

    The first non-empty path segment (presumably the bucket name in a
    path-style URL) is dropped and the remaining segments are re-joined with
    ``/``. Returns an empty string when the path has fewer than two non-empty
    segments. The query string is ignored and no percent-decoding is applied.
    """
    segments = list(filter(None, urlparse(url).path.split("/")))
    if len(segments) < 2:
        return ""
    _leading, *key_parts = segments
    return "/".join(key_parts)
+11
View File
@@ -0,0 +1,11 @@
[project]
name = "officeconvert-workspace"
version = "0.1.0"
description = "Workspace root for officeconvert Python packages."
requires-python = ">=3.12"
[tool.uv.workspace]
members = [
"packages/officeconvert",
"packages/server",
]