From 42b9ae85a8ad7fbb68931fc52f79a2dce26c2166 Mon Sep 17 00:00:00 2001 From: Elijah Duffy Date: Wed, 17 Jun 2026 11:09:52 -0700 Subject: [PATCH] use a single bucket rather than one per conversion --- .env.example | 1 + Makefile | 1 + README.md | 1 + deploy/docker-compose.yml | 1 + docker-compose.yml | 1 + .../server/src/officeconvert_server/app.py | 14 ++++- .../server/src/officeconvert_server/config.py | 5 ++ .../server/src/officeconvert_server/models.py | 2 +- .../src/officeconvert_server/service.py | 49 ++++++++--------- .../src/officeconvert_server/storage.py | 53 ++++++++++--------- 10 files changed, 77 insertions(+), 51 deletions(-) diff --git a/.env.example b/.env.example index 279a1d2..56881df 100644 --- a/.env.example +++ b/.env.example @@ -1,5 +1,6 @@ S3_ENDPOINT=seaweedfs:8333 S3_PUBLIC_ENDPOINT=localhost:8333 +S3_BUCKET=officeconvert S3_USE_SSL=false # Presigned URLs; omit to match S3_USE_SSL (internal client uses S3_ENDPOINT). S3_PUBLIC_USE_SSL=false diff --git a/Makefile b/Makefile index ad97526..30473d3 100644 --- a/Makefile +++ b/Makefile @@ -41,6 +41,7 @@ run-server: if [ "$${S3_PUBLIC_ENDPOINT:-}" = "seaweedfs:8333" ]; then S3_PUBLIC_ENDPOINT=localhost:8333; fi; \ export S3_ENDPOINT="$${S3_ENDPOINT:-localhost:8333}"; \ export S3_PUBLIC_ENDPOINT="$${S3_PUBLIC_ENDPOINT:-localhost:8333}"; \ + export S3_BUCKET="$${S3_BUCKET:-officeconvert}"; \ export S3_USE_SSL="$${S3_USE_SSL:-false}"; \ export S3_ACCESS_KEY="$${S3_ACCESS_KEY:-minioadmin}"; \ export S3_SECRET_KEY="$${S3_SECRET_KEY:-minioadmin}"; \ diff --git a/README.md b/README.md index 272a509..d54c9f6 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ Use `.env.example` as your baseline env configuration. - This project defaults to **SeaweedFS S3 API** for object transit in development and compose deployments. - The Python server uses the `minio` Python SDK, which is intentional because SeaweedFS is S3-compatible. - Runtime configuration uses `S3_*` environment variables. +- All conversions share one bucket (`S3_BUCKET`, required). Each conversion's objects live under a `{conversion_id}/` key prefix (for example `{conversion_id}/input/source.pptx` and `{conversion_id}/output/slide-0001.jpg`). ## Conversion Tuning Notes diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 094a230..ba5bcb0 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -23,6 +23,7 @@ services: environment: S3_ENDPOINT: ${S3_ENDPOINT:-seaweedfs:8333} S3_PUBLIC_ENDPOINT: ${S3_PUBLIC_ENDPOINT:-localhost:8333} + S3_BUCKET: ${S3_BUCKET:-officeconvert} S3_USE_SSL: ${S3_USE_SSL:-false} S3_ACCESS_KEY: ${S3_ACCESS_KEY:-minioadmin} S3_SECRET_KEY: ${S3_SECRET_KEY:-minioadmin} diff --git a/docker-compose.yml b/docker-compose.yml index 515b036..a9b67f4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -22,6 +22,7 @@ services: environment: S3_ENDPOINT: ${S3_ENDPOINT:-seaweedfs:8333} S3_PUBLIC_ENDPOINT: ${S3_PUBLIC_ENDPOINT:-localhost:8333} + S3_BUCKET: ${S3_BUCKET:-officeconvert} S3_USE_SSL: ${S3_USE_SSL:-false} S3_ACCESS_KEY: ${S3_ACCESS_KEY:-minioadmin} S3_SECRET_KEY: ${S3_SECRET_KEY:-minioadmin} diff --git a/python/packages/server/src/officeconvert_server/app.py b/python/packages/server/src/officeconvert_server/app.py index 9da69d0..735c2a2 100644 --- a/python/packages/server/src/officeconvert_server/app.py +++ b/python/packages/server/src/officeconvert_server/app.py @@ -10,7 +10,9 @@ from officeconvertapi.v1.conversion_connect import ConversionServiceASGIApplicat from officeconvert_server.config import load_server_config from officeconvert_server.service import ConversionServiceImpl -from officeconvert_server.storage import S3Store +from officeconvert_server.storage import S3Store, log_s3_error + +from minio.error import S3Error logger = logging.getLogger(__name__) @@ -55,6 +57,16 @@ def create_app() -> ConversionServiceASGIApplication: if os.getenv("OFFICECONVERT_S3_TRACE", "").lower() in ("1", "true", "yes"): store.enable_http_trace(sys.stderr) logger.warning("OFFICECONVERT_S3_TRACE enabled: S3 HTTP dumps on stderr") + try: + store.ensure_bucket(config.s3_bucket) + except S3Error as exc: + log_s3_error( + "ensure_bucket", + endpoint=config.s3_endpoint, + secure=config.s3_secure, + exc=exc, + ) + raise service = ConversionServiceImpl(config=config, store=store) return ConversionServiceASGIApplication(service) diff --git a/python/packages/server/src/officeconvert_server/config.py b/python/packages/server/src/officeconvert_server/config.py index 3f9af6d..f64210f 100644 --- a/python/packages/server/src/officeconvert_server/config.py +++ b/python/packages/server/src/officeconvert_server/config.py @@ -10,6 +10,7 @@ import os class ServerConfig: """Defines environment-driven settings for server orchestration.""" + s3_bucket: str s3_endpoint: str s3_access_key: str s3_secret_key: str @@ -37,7 +38,11 @@ def load_server_config() -> ServerConfig: else s3_secure ) region_env = os.getenv("S3_REGION", "").strip() + s3_bucket = os.getenv("S3_BUCKET", "").strip() + if not s3_bucket: + raise ValueError("S3_BUCKET is required") return ServerConfig( + s3_bucket=s3_bucket, s3_endpoint=os.getenv("S3_ENDPOINT", "localhost:8333"), s3_access_key=os.getenv("S3_ACCESS_KEY", "minioadmin"), s3_secret_key=os.getenv("S3_SECRET_KEY", "minioadmin"), diff --git a/python/packages/server/src/officeconvert_server/models.py b/python/packages/server/src/officeconvert_server/models.py index 1c835b2..e0e1d34 100644 --- a/python/packages/server/src/officeconvert_server/models.py +++ b/python/packages/server/src/officeconvert_server/models.py @@ -23,7 +23,7 @@ class ConversionSession: thumbnail_resolution: conversion_pb2.ConversionResolution full_jpeg_quality: int thumbnail_jpeg_quality: int - bucket_name: str + object_prefix: str upload_object_key: str status: conversion_pb2.ConversionStatus notes: conversion_pb2.NotesOptions | None = None diff --git a/python/packages/server/src/officeconvert_server/service.py b/python/packages/server/src/officeconvert_server/service.py index d2a0a8a..584c224 100644 --- a/python/packages/server/src/officeconvert_server/service.py +++ b/python/packages/server/src/officeconvert_server/service.py @@ -122,23 +122,13 @@ class ConversionServiceImpl(conversion_connect.ConversionService): ksuid = Ksuid() conversion_id = str(ksuid) - bucket_name = f"oc-{bytes(ksuid).hex()}" - upload_key = "input/source.pptx" + object_prefix = f"{conversion_id}/" + upload_key = f"{object_prefix}input/source.pptx" expires_at = utc_now() + timedelta(seconds=self._config.s3_session_ttl_seconds) - try: - self._store.ensure_bucket(bucket_name) - except S3Error as exc: - log_s3_error( - "ensure_bucket", - endpoint=self._config.s3_endpoint, - secure=self._config.s3_secure, - exc=exc, - ) - raise try: upload_url = self._store.presigned_put_url( - bucket_name, + self._config.s3_bucket, upload_key, ttl_seconds=self._config.s3_session_ttl_seconds, ) @@ -159,7 +149,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService): full_jpeg_quality=full_jpeg_quality, thumbnail_jpeg_quality=thumbnail_jpeg_quality, notes=request.notes if request.HasField("notes") else None, - bucket_name=bucket_name, + object_prefix=object_prefix, upload_object_key=upload_key, status=conversion_pb2.CONVERSION_STATUS_PENDING, ) @@ -168,7 +158,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService): return conversion_pb2.CreateConversionResponse( conversion_id=conversion_id, - upload_bucket=bucket_name, + upload_bucket=self._config.s3_bucket, upload_object_key=upload_key, upload_url=upload_url, expires_at=_to_timestamp(expires_at), @@ -265,7 +255,11 @@ class ConversionServiceImpl(conversion_connect.ConversionService): if session.conversion_task is not None and not session.conversion_task.done(): session.conversion_task.cancel() await self._cleanup_local_artifacts(session) - await asyncio.to_thread(self._store.remove_bucket_tree, session.bucket_name) + await asyncio.to_thread( + self._store.remove_prefix, + self._config.s3_bucket, + session.object_prefix, + ) return conversion_pb2.DeleteConversionResponse( conversion_id=session.conversion_id, deleted=True, @@ -295,7 +289,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService): try: await asyncio.to_thread( self._store.fget_object, - session.bucket_name, + self._config.s3_bucket, session.upload_object_key, source_path, ) @@ -436,10 +430,12 @@ class ConversionServiceImpl(conversion_connect.ConversionService): upload_total = slide_total * 2 upload_index = 0 for slide in slides: - object_key = f"output/slide-{slide.index:04d}{slide.image_path.suffix}" - self._store.fput_object(session.bucket_name, object_key, slide.image_path) + object_key = ( + f"{session.object_prefix}output/slide-{slide.index:04d}{slide.image_path.suffix}" + ) + self._store.fput_object(self._config.s3_bucket, object_key, slide.image_path) image_url = self._store.presigned_get_url( - session.bucket_name, + self._config.s3_bucket, object_key, ttl_seconds=self._config.s3_session_ttl_seconds, ) @@ -447,15 +443,16 @@ class ConversionServiceImpl(conversion_connect.ConversionService): if progress_callback is not None: progress_callback(upload_index, upload_total) thumbnail_object_key = ( - f"output/thumb/slide-{slide.index:04d}{slide.thumbnail_path.suffix}" + f"{session.object_prefix}output/thumb/slide-{slide.index:04d}" + f"{slide.thumbnail_path.suffix}" ) self._store.fput_object( - session.bucket_name, + self._config.s3_bucket, thumbnail_object_key, slide.thumbnail_path, ) thumbnail_image_url = self._store.presigned_get_url( - session.bucket_name, + self._config.s3_bucket, thumbnail_object_key, ttl_seconds=self._config.s3_session_ttl_seconds, ) @@ -515,7 +512,11 @@ class ConversionServiceImpl(conversion_connect.ConversionService): """Delete storage resources after the configured session retention period.""" try: await asyncio.sleep(self._config.conversion_cleanup_delay_seconds) - await asyncio.to_thread(self._store.remove_bucket_tree, session.bucket_name) + await asyncio.to_thread( + self._store.remove_prefix, + self._config.s3_bucket, + session.object_prefix, + ) except asyncio.CancelledError: return finally: diff --git a/python/packages/server/src/officeconvert_server/storage.py b/python/packages/server/src/officeconvert_server/storage.py index 6c193f8..1b843ea 100644 --- a/python/packages/server/src/officeconvert_server/storage.py +++ b/python/packages/server/src/officeconvert_server/storage.py @@ -123,35 +123,38 @@ class S3Store: """Upload one local filesystem object to storage.""" self._client.fput_object(bucket_name, object_key, str(source_path)) - def remove_bucket_tree(self, bucket_name: str) -> None: - """Remove all objects in a bucket and then delete the bucket.""" - objects = list(self._client.list_objects(bucket_name, recursive=True)) - if objects: - delete_requests: list[DeleteObject] = [] - for obj in objects: - object_name = obj.object_name - if object_name is None: - raise RuntimeError( - "encountered unnamed object while removing bucket contents" - ) - delete_requests.append(DeleteObject(object_name)) - - errors = self._client.remove_objects( + def remove_prefix(self, bucket_name: str, prefix: str) -> None: + """Remove all objects under a key prefix within a bucket.""" + normalized_prefix = prefix if prefix.endswith("/") else f"{prefix}/" + objects = list( + self._client.list_objects( bucket_name, - delete_requests, + prefix=normalized_prefix, + recursive=True, ) - for err in errors: - object_name = err.name or "" - message = err.message or err.code + ) + if not objects: + return + + delete_requests: list[DeleteObject] = [] + for obj in objects: + object_name = obj.object_name + if object_name is None: raise RuntimeError( - f"failed to delete object {object_name}: {message}" + "encountered unnamed object while removing prefix contents" ) - try: - self._client.remove_bucket(bucket_name) - except S3Error as exc: - # Concurrent cleanup paths may race to remove the same bucket. - if exc.code != "NoSuchBucket": - raise + delete_requests.append(DeleteObject(object_name)) + + errors = self._client.remove_objects( + bucket_name, + delete_requests, + ) + for err in errors: + object_name = err.name or "" + message = err.message or err.code + raise RuntimeError( + f"failed to delete object {object_name}: {message}" + ) def object_key_from_presigned_url(url: str) -> str: