mvp implementation
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
[project]
|
||||
name = "officeconvert"
|
||||
version = "0.1.0"
|
||||
description = "Core conversion primitives for PPTX to SlideDeck artifacts."
|
||||
readme = "../../../README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"python-pptx>=1.0.2",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/officeconvert"]
|
||||
@@ -0,0 +1,19 @@
|
||||
"""Public conversion APIs for the officeconvert Python library."""
|
||||
|
||||
from officeconvert.conversion import (
|
||||
SlideArtifact,
|
||||
SlideDeckResult,
|
||||
convert_pptx_to_pdf,
|
||||
convert_pptx_to_slidedeck,
|
||||
extract_slide_notes,
|
||||
render_pdf_to_images,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"SlideArtifact",
|
||||
"SlideDeckResult",
|
||||
"convert_pptx_to_pdf",
|
||||
"convert_pptx_to_slidedeck",
|
||||
"extract_slide_notes",
|
||||
"render_pdf_to_images",
|
||||
]
|
||||
@@ -0,0 +1,225 @@
|
||||
"""Conversion utilities for transforming PPTX files into slide image artifacts."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
from typing import Iterable
|
||||
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SlideArtifact:
|
||||
"""Represents one converted slide image and its extracted notes."""
|
||||
|
||||
index: int
|
||||
image_path: Path
|
||||
notes_plain: str
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SlideDeckResult:
|
||||
"""Represents all conversion artifacts for a single source presentation."""
|
||||
|
||||
source_filename: str
|
||||
slides: list[SlideArtifact]
|
||||
|
||||
|
||||
def convert_pptx_to_pdf(pptx_path: Path, pdf_path: Path, *, timeout_s: int = 120) -> Path:
    """Convert a PPTX file to PDF using headless LibreOffice.

    LibreOffice is invoked with ``--outdir`` and the generated file is looked up
    as ``<source stem>.pdf`` inside that directory; it is then moved to
    ``pdf_path`` when the two locations differ.

    Args:
        pptx_path: Source `.pptx` path.
        pdf_path: Destination `.pdf` path.
        timeout_s: Maximum process runtime in seconds.

    Returns:
        The resolved PDF path.

    Raises:
        FileNotFoundError: If the source PPTX does not exist.
        RuntimeError: If `soffice` cannot be started, the conversion exceeds
            `timeout_s`, LibreOffice exits with a non-zero status, or the
            expected output file is not created.
    """
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    output_dir = pdf_path.parent.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    # Build the destination from the resolved directory so the "already in
    # place" comparison below is not fooled by a relative `pdf_path`.
    target_pdf = output_dir / pdf_path.name

    command = [
        "soffice",
        "--headless",
        "--convert-to",
        "pdf",
        "--outdir",
        str(output_dir),
        str(pptx_path.resolve()),
    ]
    # NOTE(review): concurrent `soffice` runs presumably share one user
    # profile; confirm whether parallel conversions need isolated profiles.
    try:
        completed = subprocess.run(
            command,
            check=False,
            capture_output=True,
            text=True,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            f"LibreOffice conversion timed out after {timeout_s}s"
        ) from exc
    except OSError as exc:
        # Covers a missing or non-executable `soffice` binary. Without this
        # the caller would see a FileNotFoundError that looks like a missing
        # source file.
        raise RuntimeError(f"could not start LibreOffice (soffice): {exc}") from exc

    if completed.returncode != 0:
        raise RuntimeError(
            f"LibreOffice conversion failed: {completed.stderr.strip() or completed.stdout.strip()}"
        )

    generated_pdf = output_dir / f"{pptx_path.stem}.pdf"
    if not generated_pdf.exists():
        raise RuntimeError(f"LibreOffice did not create expected PDF: {generated_pdf}")

    if generated_pdf != target_pdf:
        generated_pdf.replace(target_pdf)

    return pdf_path.resolve()
|
||||
|
||||
|
||||
def render_pdf_to_images(
    pdf_path: Path,
    out_dir: Path,
    *,
    dpi: int = 180,
    image_format: str = "png",
    timeout_s: int = 120,
) -> list[Path]:
    """Render each PDF page into an image using Poppler's `pdftoppm`.

    Output files are written as ``slide-<page>.<ext>`` inside `out_dir`. The
    extension is not always the format name: `pdftoppm` writes ``.jpg`` for
    ``-jpeg`` and ``.tif`` for ``-tiff``, so the lookup maps the format to the
    extension actually produced.

    Args:
        pdf_path: Source PDF path.
        out_dir: Output directory for rendered images.
        dpi: Target rasterization DPI.
        image_format: Image format supported by `pdftoppm` (`png`, `jpeg`, ...).
        timeout_s: Maximum command runtime in seconds.

    Returns:
        Ordered list of slide image paths.

    Raises:
        FileNotFoundError: If the PDF path does not exist.
        RuntimeError: If `pdftoppm` cannot be started, exceeds `timeout_s`,
            exits with a non-zero status, or no output images are produced.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"source PDF does not exist: {pdf_path}")

    # File extensions pdftoppm emits when they differ from the flag name.
    extension_by_format = {
        "jpeg": "jpg",
        "jpegcmyk": "jpg",
        "tiff": "tif",
        "mono": "pbm",
        "gray": "pgm",
    }
    extension = extension_by_format.get(image_format, image_format)

    out_dir.mkdir(parents=True, exist_ok=True)
    prefix_path = out_dir / "slide"
    command = [
        "pdftoppm",
        "-r",
        str(dpi),
        f"-{image_format}",
        str(pdf_path.resolve()),
        str(prefix_path),
    ]
    try:
        completed = subprocess.run(
            command,
            check=False,
            capture_output=True,
            text=True,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            f"Poppler rasterization timed out after {timeout_s}s"
        ) from exc
    except OSError as exc:
        # Covers a missing or non-executable `pdftoppm` binary.
        raise RuntimeError(f"could not start Poppler (pdftoppm): {exc}") from exc

    if completed.returncode != 0:
        raise RuntimeError(
            f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}"
        )

    # Sort by (length, name) so page order stays numeric even if the page
    # counter is not zero-padded ("slide-2" before "slide-10").
    images = sorted(
        out_dir.glob(f"slide-*.{extension}"),
        key=lambda path: (len(path.name), path.name),
    )
    if not images:
        raise RuntimeError(f"no rendered images found in {out_dir}")
    return [image.resolve() for image in images]
|
||||
|
||||
|
||||
def extract_slide_notes(pptx_path: Path) -> list[str]:
    """Extract plain-text notes for each slide in slide index order.

    Only the notes body placeholder of each notes slide is read. A notes slide
    can also carry other placeholders (for example slide number or
    header/footer); their text is not part of the notes and is left out.

    Args:
        pptx_path: Source presentation path.

    Returns:
        A list of note strings aligned with source slide order. Slides without
        a notes slide, or whose notes slide has no notes placeholder,
        contribute an empty string.

    Raises:
        FileNotFoundError: If the source PPTX does not exist.
    """
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    presentation = Presentation(str(pptx_path.resolve()))
    notes: list[str] = []
    for slide in presentation.slides:
        # Check first: accessing `notes_slide` on a slide that has none would
        # create one as a side effect.
        if not slide.has_notes_slide:
            notes.append("")
            continue
        placeholder = slide.notes_slide.notes_placeholder
        if placeholder is None:
            notes.append("")
            continue
        notes.append(_extract_notes_text([placeholder]))
    return notes
|
||||
|
||||
|
||||
def convert_pptx_to_slidedeck(
    pptx_path: Path,
    work_dir: Path,
    *,
    dpi: int = 180,
    image_format: str = "png",
) -> SlideDeckResult:
    """Run the full PPTX -> PDF -> images pipeline and attach slide notes.

    LibreOffice produces an intermediate PDF inside `work_dir`, Poppler
    rasterizes it into `work_dir/slides`, and the notes are read from the
    original PPTX rather than from the rendered output.

    Args:
        pptx_path: Source `.pptx` path.
        work_dir: Scratch directory for generated outputs.
        dpi: Rasterization DPI for output slide images.
        image_format: Output image format accepted by `pdftoppm`.

    Returns:
        A `SlideDeckResult` whose slides are numbered from 1 and point at
        local image paths.

    Raises:
        ValueError: If rendered page count differs from note count.
    """
    scratch = work_dir.resolve()
    scratch.mkdir(parents=True, exist_ok=True)
    intermediate_pdf = scratch / f"{pptx_path.stem}.pdf"
    slides_dir = scratch / "slides"

    convert_pptx_to_pdf(pptx_path, intermediate_pdf)
    rendered = render_pdf_to_images(
        intermediate_pdf,
        slides_dir,
        dpi=dpi,
        image_format=image_format,
    )
    slide_notes = extract_slide_notes(pptx_path)

    # Images and notes are paired by position, so the counts must agree.
    if len(rendered) != len(slide_notes):
        raise ValueError(
            "rendered slide count does not match note count: "
            f"{len(rendered)} image(s) vs {len(slide_notes)} note entries"
        )

    artifacts: list[SlideArtifact] = []
    for position, (image, note_text) in enumerate(zip(rendered, slide_notes), start=1):
        artifacts.append(
            SlideArtifact(index=position, image_path=image, notes_plain=note_text)
        )
    return SlideDeckResult(source_filename=pptx_path.name, slides=artifacts)
|
||||
|
||||
|
||||
def _extract_notes_text(shapes: Iterable[object]) -> str:
    """Flatten the text of the given shapes into one plain string.

    Paragraphs of one shape are joined with a newline and shapes are separated
    by a blank line. Shapes without a `text_frame`, or whose text is only
    whitespace, are skipped.
    """

    def _shape_text(shape: object) -> str:
        frame = getattr(shape, "text_frame", None)
        if frame is None:
            return ""
        # One line per paragraph; surrounding whitespace of the shape is dropped.
        return "\n".join(paragraph.text for paragraph in frame.paragraphs).strip()

    blocks = [text for text in map(_shape_text, shapes) if text]
    return "\n\n".join(blocks).strip()
|
||||
@@ -0,0 +1,19 @@
|
||||
[project]
|
||||
name = "officeconvert-server"
|
||||
version = "0.1.0"
|
||||
description = "ConnectRPC server orchestrating file conversions with MinIO."
|
||||
readme = "../../../README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"connectrpc>=0.6.0",
|
||||
"minio>=7.2.18",
|
||||
"officeconvert",
|
||||
"uvicorn>=0.35.0",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/officeconvert_server"]
|
||||
@@ -0,0 +1,13 @@
|
||||
"""Public exports for the officeconvert server package."""
|
||||
|
||||
from officeconvert_server.app import app, create_app
|
||||
from officeconvert_server.config import ServerConfig, load_server_config
|
||||
from officeconvert_server.service import ConversionServiceImpl
|
||||
|
||||
__all__ = [
|
||||
"ServerConfig",
|
||||
"ConversionServiceImpl",
|
||||
"app",
|
||||
"create_app",
|
||||
"load_server_config",
|
||||
]
|
||||
@@ -0,0 +1,27 @@
|
||||
"""ASGI application entrypoint for the officeconvert Connect service."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from officeconvertapi.v1.conversion_connect import ConversionServiceASGIApplication
|
||||
|
||||
from officeconvert_server.config import load_server_config
|
||||
from officeconvert_server.service import ConversionServiceImpl
|
||||
from officeconvert_server.storage import MinIOStore
|
||||
|
||||
|
||||
def create_app() -> ConversionServiceASGIApplication:
    """Build the Connect ASGI application from environment-driven settings.

    Loads the server configuration, creates the MinIO storage adapter from it,
    and wraps a `ConversionServiceImpl` in the generated ASGI application.
    """
    settings = load_server_config()
    storage = MinIOStore(
        endpoint=settings.minio_endpoint,
        access_key=settings.minio_access_key,
        secret_key=settings.minio_secret_key,
        secure=settings.minio_secure,
        public_endpoint=settings.minio_public_endpoint,
    )
    return ConversionServiceASGIApplication(
        ConversionServiceImpl(config=settings, store=storage)
    )


# Module-level ASGI application, built at import time so it can be served with
# `uvicorn officeconvert_server.app:app`.
app = create_app()
|
||||
@@ -0,0 +1,34 @@
|
||||
"""Runtime configuration for the officeconvert Connect server."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import os
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class ServerConfig:
|
||||
"""Defines environment-driven settings for server orchestration."""
|
||||
|
||||
minio_endpoint: str
|
||||
minio_access_key: str
|
||||
minio_secret_key: str
|
||||
minio_secure: bool
|
||||
minio_public_endpoint: str
|
||||
minio_session_ttl_seconds: int
|
||||
conversion_cleanup_delay_seconds: int
|
||||
|
||||
|
||||
def load_server_config() -> ServerConfig:
    """Load server configuration from environment variables.

    Every setting has a local-development default. `MINIO_USE_SSL` is treated
    as enabled for `1`, `true`, `yes` or `on` (case-insensitive, surrounding
    whitespace ignored); any other value disables TLS.

    Raises:
        ValueError: If `MINIO_SESSION_TTL_SECONDS` or
            `CONVERSION_CLEANUP_DELAY_SECONDS` is not an integer.
    """
    return ServerConfig(
        minio_endpoint=os.getenv("MINIO_ENDPOINT", "localhost:9000"),
        minio_access_key=os.getenv("MINIO_ACCESS_KEY", "minioadmin"),
        minio_secret_key=os.getenv("MINIO_SECRET_KEY", "minioadmin"),
        minio_secure=_env_flag("MINIO_USE_SSL", "false"),
        minio_public_endpoint=os.getenv("MINIO_PUBLIC_ENDPOINT", "localhost:9000"),
        minio_session_ttl_seconds=_env_int("MINIO_SESSION_TTL_SECONDS", "3600"),
        conversion_cleanup_delay_seconds=_env_int(
            "CONVERSION_CLEANUP_DELAY_SECONDS", "3600"
        ),
    )


def _env_flag(name: str, default: str) -> bool:
    """Read an environment variable as an on/off switch."""
    return os.getenv(name, default).strip().lower() in {"1", "true", "yes", "on"}


def _env_int(name: str, default: str) -> int:
    """Read an environment variable as an integer, naming it on failure."""
    raw = os.getenv(name, default)
    try:
        return int(raw)
    except ValueError as exc:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from exc
|
||||
@@ -0,0 +1,30 @@
|
||||
"""In-memory models representing conversion workflow state."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
def utc_now() -> datetime:
    """Return the current moment as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
@dataclass(slots=True)
class ConversionSession:
    """Mutable record tracking one conversion from creation to cleanup."""

    # Identity of the conversion and the file name it was created for.
    conversion_id: str
    source_filename: str
    # Bucket and object key under which the source upload is stored.
    bucket_name: str
    upload_object_key: str
    # Current conversion status, stored as a plain integer code.
    status: int
    # Both timestamps default to the time the record is created.
    created_at: datetime = field(default_factory=utc_now)
    updated_at: datetime = field(default_factory=utc_now)
    # Error text for the conversion; empty by default.
    error_message: str = ""
    # Slide deck produced by the conversion; unset by default.
    slide_deck: Any | None = None
    # Local working directory for the conversion; unset by default.
    work_dir: Path | None = None
    # Handles for the conversion and cleanup work; unset by default.
    conversion_task: Any | None = None
    cleanup_task: Any | None = None
|
||||
@@ -0,0 +1,269 @@
|
||||
"""Connect service implementation for conversion request orchestration."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
from connectrpc.code import Code
|
||||
from connectrpc.errors import ConnectError
|
||||
from connectrpc.request import RequestContext
|
||||
from google.protobuf.timestamp_pb2 import Timestamp
|
||||
from officeconvert import SlideArtifact, convert_pptx_to_slidedeck
|
||||
from officeconvertapi.v1 import conversion_connect, conversion_pb2
|
||||
|
||||
from officeconvert_server.config import ServerConfig
|
||||
from officeconvert_server.models import ConversionSession, utc_now
|
||||
from officeconvert_server.storage import MinIOStore
|
||||
|
||||
|
||||
class ConversionServiceImpl(conversion_connect.ConversionService):
    """Implements the conversion API with in-memory state and MinIO orchestration.

    Sessions are kept only in this process, in `self._sessions`, and each one
    uses its own MinIO bucket named `oc-<conversion_id>`. Every lookup or
    mutation of the session registry happens under `self._lock`.
    """

    def __init__(self, config: ServerConfig, store: MinIOStore) -> None:
        """Initialize service with runtime config and storage adapter."""
        self._config = config
        self._store = store
        self._sessions: dict[str, ConversionSession] = {}
        self._lock = asyncio.Lock()

    async def create_conversion(
        self,
        request: conversion_pb2.CreateConversionRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.CreateConversionResponse:
        """Create a new conversion session and return upload credentials.

        Raises:
            ConnectError: `INVALID_ARGUMENT` when the file name is empty or
                does not end in `.pptx`.
        """
        del ctx
        source_filename = request.source_filename.strip()
        if not source_filename:
            raise ConnectError(Code.INVALID_ARGUMENT, "source_filename is required")
        if not source_filename.lower().endswith(".pptx"):
            raise ConnectError(Code.INVALID_ARGUMENT, "only .pptx input is supported")

        conversion_id = str(uuid.uuid4())
        bucket_name = f"oc-{conversion_id}"
        upload_key = "input/source.pptx"
        ttl_seconds = self._config.minio_session_ttl_seconds
        expires_at = utc_now() + timedelta(seconds=ttl_seconds)

        # The MinIO client is synchronous; run it in a worker thread so bucket
        # creation and URL signing do not stall the event loop.
        await asyncio.to_thread(self._store.ensure_bucket, bucket_name)
        upload_url = await asyncio.to_thread(
            self._store.presigned_put_url,
            bucket_name,
            upload_key,
            ttl_seconds=ttl_seconds,
        )

        session = ConversionSession(
            conversion_id=conversion_id,
            source_filename=source_filename,
            bucket_name=bucket_name,
            upload_object_key=upload_key,
            status=conversion_pb2.CONVERSION_STATUS_PENDING,
        )
        async with self._lock:
            self._sessions[conversion_id] = session

        return conversion_pb2.CreateConversionResponse(
            conversion_id=conversion_id,
            upload_bucket=bucket_name,
            upload_object_key=upload_key,
            upload_url=upload_url,
            expires_at=_to_timestamp(expires_at),
        )

    async def start_conversion(
        self,
        request: conversion_pb2.StartConversionRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.StartConversionResponse:
        """Start asynchronous conversion for an already-uploaded session payload.

        Calling this while the conversion is already running is a no-op that
        reports the running status.

        Raises:
            ConnectError: `NOT_FOUND` for an unknown id, `FAILED_PRECONDITION`
                when the conversion has already failed or succeeded.
        """
        del ctx
        async with self._lock:
            # Look up and transition under one lock acquisition so a concurrent
            # delete cannot slip in between the lookup and the task start.
            session = self._sessions.get(request.conversion_id)
            if session is None:
                raise ConnectError(Code.NOT_FOUND, "conversion_id not found")
            if session.status == conversion_pb2.CONVERSION_STATUS_RUNNING:
                return conversion_pb2.StartConversionResponse(
                    conversion_id=session.conversion_id,
                    status=session.status,
                )
            if session.status in (
                conversion_pb2.CONVERSION_STATUS_FAILED,
                conversion_pb2.CONVERSION_STATUS_SUCCEEDED,
            ):
                raise ConnectError(
                    Code.FAILED_PRECONDITION,
                    "conversion has already completed",
                )

            session.status = conversion_pb2.CONVERSION_STATUS_RUNNING
            session.updated_at = utc_now()
            # Keeping the task on the session holds a strong reference to it.
            session.conversion_task = asyncio.create_task(self._run_conversion(session))

        return conversion_pb2.StartConversionResponse(
            conversion_id=session.conversion_id,
            status=session.status,
        )

    async def get_conversion_status(
        self,
        request: conversion_pb2.GetConversionStatusRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.GetConversionStatusResponse:
        """Return current conversion status and optional error details."""
        del ctx
        session = await self._get_session(request.conversion_id)
        return conversion_pb2.GetConversionStatusResponse(
            conversion_id=session.conversion_id,
            status=session.status,
            error_message=session.error_message,
            updated_at=_to_timestamp(session.updated_at),
        )

    async def get_slide_deck(
        self,
        request: conversion_pb2.GetSlideDeckRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.GetSlideDeckResponse:
        """Return the finished slide deck once conversion succeeds.

        Raises:
            ConnectError: `NOT_FOUND` for an unknown id, `FAILED_PRECONDITION`
                when the conversion failed or has not finished, `INTERNAL`
                when a succeeded session has no deck attached.
        """
        del ctx
        session = await self._get_session(request.conversion_id)
        if session.status == conversion_pb2.CONVERSION_STATUS_FAILED:
            raise ConnectError(Code.FAILED_PRECONDITION, session.error_message)
        if session.status != conversion_pb2.CONVERSION_STATUS_SUCCEEDED:
            raise ConnectError(Code.FAILED_PRECONDITION, "conversion is not finished yet")
        if session.slide_deck is None:
            raise ConnectError(Code.INTERNAL, "slide deck missing from successful session")

        return conversion_pb2.GetSlideDeckResponse(slide_deck=session.slide_deck)

    async def delete_conversion(
        self,
        request: conversion_pb2.DeleteConversionRequest,
        ctx: RequestContext,
    ) -> conversion_pb2.DeleteConversionResponse:
        """Delete a conversion session and associated MinIO/local artifacts.

        Returns `deleted=False` when the id is unknown.
        """
        del ctx
        async with self._lock:
            session = self._sessions.pop(request.conversion_id, None)
        if session is None:
            return conversion_pb2.DeleteConversionResponse(
                conversion_id=request.conversion_id,
                deleted=False,
            )

        if session.cleanup_task is not None:
            session.cleanup_task.cancel()
        if session.conversion_task is not None and not session.conversion_task.done():
            # NOTE(review): cancelling the task does not stop a worker thread
            # that is already running; it may still touch the work dir or
            # bucket briefly after this point.
            session.conversion_task.cancel()
        await self._cleanup_local_artifacts(session)
        await asyncio.to_thread(self._store.remove_bucket_tree, session.bucket_name)
        return conversion_pb2.DeleteConversionResponse(
            conversion_id=session.conversion_id,
            deleted=True,
        )

    async def _run_conversion(self, session: ConversionSession) -> None:
        """Execute conversion flow and persist terminal state in memory.

        Downloads the upload, converts it, uploads the slide images and
        records SUCCEEDED or FAILED on the session. Local scratch files are
        always removed; the delayed bucket cleanup is scheduled only while the
        session is still registered.
        """
        work_dir = Path(
            tempfile.mkdtemp(prefix=f"officeconvert-{session.conversion_id}-")
        ).resolve()
        session.work_dir = work_dir
        source_path = work_dir / "input.pptx"
        try:
            await asyncio.to_thread(
                self._store.fget_object,
                session.bucket_name,
                session.upload_object_key,
                source_path,
            )
            result = await asyncio.to_thread(
                convert_pptx_to_slidedeck,
                source_path,
                work_dir,
            )
            session.slide_deck = await asyncio.to_thread(
                self._upload_and_build_slide_deck,
                session,
                result.slides,
                # Report the client's file name; the conversion result only
                # knows the scratch copy, which is always "input.pptx".
                session.source_filename,
            )
            session.status = conversion_pb2.CONVERSION_STATUS_SUCCEEDED
            session.updated_at = utc_now()
        except asyncio.CancelledError:
            session.status = conversion_pb2.CONVERSION_STATUS_FAILED
            session.error_message = "conversion cancelled"
            session.updated_at = utc_now()
            raise
        except Exception as exc:
            # Task boundary: any failure becomes a FAILED session that clients
            # can read through the status endpoint.
            session.status = conversion_pb2.CONVERSION_STATUS_FAILED
            session.error_message = str(exc)
            session.updated_at = utc_now()
        finally:
            await self._cleanup_local_artifacts(session)
            async with self._lock:
                still_registered = self._sessions.get(session.conversion_id) is session
            # A session removed by delete_conversion has already had its bucket
            # deleted; scheduling another cleanup would leave an orphan task.
            if still_registered:
                session.cleanup_task = asyncio.create_task(self._delayed_cleanup(session))

    def _upload_and_build_slide_deck(
        self,
        session: ConversionSession,
        slides: list[SlideArtifact],
        source_filename: str,
    ) -> conversion_pb2.SlideDeck:
        """Upload generated slide images and construct API response payload.

        Synchronous; meant to be run in a worker thread. Each image is stored
        as `output/slide-<index>.<suffix>` and exposed via a presigned GET URL.
        """
        response_slides: list[conversion_pb2.Slide] = []
        for slide in slides:
            object_key = f"output/slide-{slide.index:04d}{slide.image_path.suffix}"
            self._store.fput_object(session.bucket_name, object_key, slide.image_path)
            image_url = self._store.presigned_get_url(
                session.bucket_name,
                object_key,
                ttl_seconds=self._config.minio_session_ttl_seconds,
            )
            response_slides.append(
                conversion_pb2.Slide(
                    index=slide.index,
                    notes_plain=slide.notes_plain,
                    image_url=image_url,
                )
            )

        return conversion_pb2.SlideDeck(
            conversion_id=session.conversion_id,
            source_filename=source_filename,
            slides=response_slides,
            created_at=_to_timestamp(utc_now()),
        )

    async def _delayed_cleanup(self, session: ConversionSession) -> None:
        """Delete storage resources after the configured session retention period.

        The session is dropped from the registry whether the wait completes,
        is cancelled, or the bucket removal fails.
        """
        try:
            await asyncio.sleep(self._config.conversion_cleanup_delay_seconds)
            await asyncio.to_thread(self._store.remove_bucket_tree, session.bucket_name)
        except asyncio.CancelledError:
            return
        finally:
            async with self._lock:
                self._sessions.pop(session.conversion_id, None)

    async def _cleanup_local_artifacts(self, session: ConversionSession) -> None:
        """Delete temporary local files for a session if they still exist."""
        if session.work_dir is not None and session.work_dir.exists():
            # Second positional argument is ignore_errors=True: best effort.
            await asyncio.to_thread(shutil.rmtree, session.work_dir, True)
        session.work_dir = None

    async def _get_session(self, conversion_id: str) -> ConversionSession:
        """Return an existing session or raise a NOT_FOUND error."""
        async with self._lock:
            session = self._sessions.get(conversion_id)
        if session is None:
            raise ConnectError(Code.NOT_FOUND, "conversion_id not found")
        return session
|
||||
|
||||
|
||||
def _to_timestamp(value: datetime) -> Timestamp:
    """Build a protobuf Timestamp from a datetime, expressed in UTC."""
    stamp = Timestamp()
    # Convert to UTC first so the stored instant does not depend on the
    # timezone the caller happened to use.
    stamp.FromDatetime(value.astimezone(timezone.utc))
    return stamp
|
||||
@@ -0,0 +1,94 @@
|
||||
"""MinIO helper abstraction for upload and artifact lifecycle."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from minio import Minio
|
||||
from minio.deleteobjects import DeleteObject
|
||||
from minio.error import S3Error
|
||||
|
||||
|
||||
class MinIOStore:
    """Provides typed helper methods around MinIO object storage operations.

    Two clients are kept: `_client` performs all object and bucket operations,
    while `_public_client` is configured with the public endpoint and is used
    only to generate presigned URLs.
    """

    def __init__(
        self,
        *,
        endpoint: str,
        access_key: str,
        secret_key: str,
        secure: bool,
        public_endpoint: str,
    ) -> None:
        """Initialize MinIO clients for internal and public URL generation.

        Both clients share the same credentials and TLS setting.
        """
        self._client = Minio(
            endpoint,
            access_key=access_key,
            secret_key=secret_key,
            secure=secure,
        )
        # NOTE(review): presigning may need to look up the bucket region via
        # this client — confirm the public endpoint is reachable from the
        # server, or configure a fixed region.
        self._public_client = Minio(
            public_endpoint,
            access_key=access_key,
            secret_key=secret_key,
            secure=secure,
        )

    def ensure_bucket(self, bucket_name: str) -> None:
        """Create a bucket if it does not already exist."""
        if not self._client.bucket_exists(bucket_name):
            self._client.make_bucket(bucket_name)

    def presigned_put_url(self, bucket_name: str, object_key: str, *, ttl_seconds: int) -> str:
        """Generate a presigned PUT URL for a single object upload."""
        return self._public_client.presigned_put_object(
            bucket_name,
            object_key,
            expires=timedelta(seconds=ttl_seconds),
        )

    def presigned_get_url(self, bucket_name: str, object_key: str, *, ttl_seconds: int) -> str:
        """Generate a presigned GET URL for downloading one object."""
        return self._public_client.presigned_get_object(
            bucket_name,
            object_key,
            expires=timedelta(seconds=ttl_seconds),
        )

    def fget_object(self, bucket_name: str, object_key: str, output_path: Path) -> None:
        """Download one object from MinIO to a local filesystem path.

        The parent directory of `output_path` is created when missing.
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)
        self._client.fget_object(bucket_name, object_key, str(output_path))

    def fput_object(self, bucket_name: str, object_key: str, source_path: Path) -> None:
        """Upload one local filesystem object to MinIO."""
        self._client.fput_object(bucket_name, object_key, str(source_path))

    def remove_bucket_tree(self, bucket_name: str) -> None:
        """Remove all objects in a bucket and then delete the bucket.

        A bucket that no longer exists is treated as already cleaned up.

        Raises:
            RuntimeError: If MinIO reports a failure deleting an object.
            S3Error: For any storage error other than `NoSuchBucket`.
        """
        try:
            objects = list(self._client.list_objects(bucket_name, recursive=True))
            # remove_objects is lazy: deletions only happen while the returned
            # error iterator is consumed, so it must be iterated even when no
            # failure is expected.
            errors = self._client.remove_objects(
                bucket_name,
                [DeleteObject(obj.object_name) for obj in objects],
            )
            for err in errors:
                raise RuntimeError(
                    f"failed to delete object {err.name}: {err.message}"
                )
            self._client.remove_bucket(bucket_name)
        except S3Error as exc:
            # Concurrent cleanup paths may race to remove the same bucket; the
            # listing can hit the missing bucket as well as the final removal.
            if exc.code != "NoSuchBucket":
                raise
|
||||
|
||||
|
||||
def object_key_from_presigned_url(url: str) -> str:
    """Extract object key from a presigned URL path for diagnostics.

    The URL path is expected to be `/<bucket>/<object key>`. The first path
    segment is dropped and the remainder is percent-decoded, so the result is
    the key as stored rather than its URL-escaped form.

    Args:
        url: Presigned URL; query string and fragment are ignored.

    Returns:
        The decoded object key, or an empty string when the path holds no key.
    """
    from urllib.parse import unquote

    path = urlparse(url).path.lstrip("/")
    # Split once only: everything after the bucket segment belongs to the key,
    # including any further slashes.
    _bucket, separator, encoded_key = path.partition("/")
    return unquote(encoded_key) if separator else ""
|
||||
@@ -0,0 +1,11 @@
|
||||
[project]
|
||||
name = "officeconvert-workspace"
|
||||
version = "0.1.0"
|
||||
description = "Workspace root for officeconvert Python packages."
|
||||
requires-python = ">=3.12"
|
||||
|
||||
[tool.uv.workspace]
|
||||
members = [
|
||||
"packages/officeconvert",
|
||||
"packages/server",
|
||||
]
|
||||
Reference in New Issue
Block a user