diff --git a/.env.example b/.env.example index 4b62825..0ab1d9b 100644 --- a/.env.example +++ b/.env.example @@ -7,4 +7,8 @@ S3_SESSION_TTL_SECONDS=3600 CONVERSION_IMAGE_DPI=150 CONVERSION_PPTX_TO_PDF_TIMEOUT_SECONDS=180 CONVERSION_PDF_TO_IMAGES_TIMEOUT_SECONDS=600 +CONVERSION_PPTX_TO_PDF_BASE_TIMEOUT_SECONDS=45 +CONVERSION_PPTX_TO_PDF_PER_SLIDE_TIMEOUT_SECONDS=3 +CONVERSION_PDF_TO_IMAGES_BASE_TIMEOUT_SECONDS=30 +CONVERSION_PDF_TO_IMAGES_PER_SLIDE_TIMEOUT_SECONDS=8 CONVERSION_CLEANUP_DELAY_SECONDS=3600 diff --git a/.gitignore b/.gitignore index 493f6be..4c16649 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ coverage.out # But never track Python bytecode/cache artifacts from generated code. gen/**/__pycache__/ gen/**/*.py[cod] +.cache/ diff --git a/Makefile b/Makefile index b8ef363..0720e7d 100644 --- a/Makefile +++ b/Makefile @@ -35,8 +35,12 @@ run-server: export S3_ACCESS_KEY="$${S3_ACCESS_KEY:-minioadmin}"; \ export S3_SECRET_KEY="$${S3_SECRET_KEY:-minioadmin}"; \ export S3_SESSION_TTL_SECONDS="$${S3_SESSION_TTL_SECONDS:-3600}"; \ - export CONVERSION_IMAGE_DPI="$${CONVERSION_IMAGE_DPI:-150}"; \ + export CONVERSION_IMAGE_DPI="$${CONVERSION_IMAGE_DPI:-72}"; \ export CONVERSION_PPTX_TO_PDF_TIMEOUT_SECONDS="$${CONVERSION_PPTX_TO_PDF_TIMEOUT_SECONDS:-180}"; \ export CONVERSION_PDF_TO_IMAGES_TIMEOUT_SECONDS="$${CONVERSION_PDF_TO_IMAGES_TIMEOUT_SECONDS:-600}"; \ + export CONVERSION_PPTX_TO_PDF_BASE_TIMEOUT_SECONDS="$${CONVERSION_PPTX_TO_PDF_BASE_TIMEOUT_SECONDS:-45}"; \ + export CONVERSION_PPTX_TO_PDF_PER_SLIDE_TIMEOUT_SECONDS="$${CONVERSION_PPTX_TO_PDF_PER_SLIDE_TIMEOUT_SECONDS:-3}"; \ + export CONVERSION_PDF_TO_IMAGES_BASE_TIMEOUT_SECONDS="$${CONVERSION_PDF_TO_IMAGES_BASE_TIMEOUT_SECONDS:-30}"; \ + export CONVERSION_PDF_TO_IMAGES_PER_SLIDE_TIMEOUT_SECONDS="$${CONVERSION_PDF_TO_IMAGES_PER_SLIDE_TIMEOUT_SECONDS:-8}"; \ export CONVERSION_CLEANUP_DELAY_SECONDS="$${CONVERSION_CLEANUP_DELAY_SECONDS:-3600}"; \ uv run --project python --package officeconvert-server python -m uvicorn officeconvert_server.app:app --host "$${UVICORN_HOST:-0.0.0.0}" --port "$${UVICORN_PORT:-8080}" diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index d8a1ae4..b04cf0c 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -27,9 +27,13 @@ services: S3_ACCESS_KEY: ${S3_ACCESS_KEY:-minioadmin} S3_SECRET_KEY: ${S3_SECRET_KEY:-minioadmin} S3_SESSION_TTL_SECONDS: ${S3_SESSION_TTL_SECONDS:-3600} - CONVERSION_IMAGE_DPI: ${CONVERSION_IMAGE_DPI:-150} + CONVERSION_IMAGE_DPI: ${CONVERSION_IMAGE_DPI:-72} CONVERSION_PPTX_TO_PDF_TIMEOUT_SECONDS: ${CONVERSION_PPTX_TO_PDF_TIMEOUT_SECONDS:-180} CONVERSION_PDF_TO_IMAGES_TIMEOUT_SECONDS: ${CONVERSION_PDF_TO_IMAGES_TIMEOUT_SECONDS:-600} + CONVERSION_PPTX_TO_PDF_BASE_TIMEOUT_SECONDS: ${CONVERSION_PPTX_TO_PDF_BASE_TIMEOUT_SECONDS:-45} + CONVERSION_PPTX_TO_PDF_PER_SLIDE_TIMEOUT_SECONDS: ${CONVERSION_PPTX_TO_PDF_PER_SLIDE_TIMEOUT_SECONDS:-3} + CONVERSION_PDF_TO_IMAGES_BASE_TIMEOUT_SECONDS: ${CONVERSION_PDF_TO_IMAGES_BASE_TIMEOUT_SECONDS:-30} + CONVERSION_PDF_TO_IMAGES_PER_SLIDE_TIMEOUT_SECONDS: ${CONVERSION_PDF_TO_IMAGES_PER_SLIDE_TIMEOUT_SECONDS:-8} CONVERSION_CLEANUP_DELAY_SECONDS: ${CONVERSION_CLEANUP_DELAY_SECONDS:-3600} ports: - "8080:8080" diff --git a/gen/go/officeconvertapi/v1/conversion.pb.go b/gen/go/officeconvertapi/v1/conversion.pb.go index 3d222da..d8fe3a7 100644 --- a/gen/go/officeconvertapi/v1/conversion.pb.go +++ b/gen/go/officeconvertapi/v1/conversion.pb.go @@ -78,6 +78,65 @@ func (ConversionStatus) EnumDescriptor() ([]byte, []int) { return file_officeconvertapi_v1_conversion_proto_rawDescGZIP(), []int{0} } +// ConversionPhase represents the active stage for a running conversion. +type ConversionPhase int32 + +const ( + ConversionPhase_CONVERSION_PHASE_UNSPECIFIED ConversionPhase = 0 + ConversionPhase_CONVERSION_PHASE_INACTIVE ConversionPhase = 1 + ConversionPhase_CONVERSION_PHASE_EXTRACTING_NOTES ConversionPhase = 2 + ConversionPhase_CONVERSION_PHASE_PPTX_TO_PDF ConversionPhase = 3 + ConversionPhase_CONVERSION_PHASE_PDF_TO_IMAGES ConversionPhase = 4 + ConversionPhase_CONVERSION_PHASE_UPLOADING_RESULTS ConversionPhase = 5 +) + +// Enum value maps for ConversionPhase. +var ( + ConversionPhase_name = map[int32]string{ + 0: "CONVERSION_PHASE_UNSPECIFIED", + 1: "CONVERSION_PHASE_INACTIVE", + 2: "CONVERSION_PHASE_EXTRACTING_NOTES", + 3: "CONVERSION_PHASE_PPTX_TO_PDF", + 4: "CONVERSION_PHASE_PDF_TO_IMAGES", + 5: "CONVERSION_PHASE_UPLOADING_RESULTS", + } + ConversionPhase_value = map[string]int32{ + "CONVERSION_PHASE_UNSPECIFIED": 0, + "CONVERSION_PHASE_INACTIVE": 1, + "CONVERSION_PHASE_EXTRACTING_NOTES": 2, + "CONVERSION_PHASE_PPTX_TO_PDF": 3, + "CONVERSION_PHASE_PDF_TO_IMAGES": 4, + "CONVERSION_PHASE_UPLOADING_RESULTS": 5, + } +) + +func (x ConversionPhase) Enum() *ConversionPhase { + p := new(ConversionPhase) + *p = x + return p +} + +func (x ConversionPhase) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ConversionPhase) Descriptor() protoreflect.EnumDescriptor { + return file_officeconvertapi_v1_conversion_proto_enumTypes[1].Descriptor() +} + +func (ConversionPhase) Type() protoreflect.EnumType { + return &file_officeconvertapi_v1_conversion_proto_enumTypes[1] +} + +func (x ConversionPhase) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use ConversionPhase.Descriptor instead. +func (ConversionPhase) EnumDescriptor() ([]byte, []int) { + return file_officeconvertapi_v1_conversion_proto_rawDescGZIP(), []int{1} +} + // Slide contains extracted notes and the rendered image URL for one slide. type Slide struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -475,13 +534,16 @@ func (x *GetConversionStatusRequest) GetConversionId() string { // GetConversionStatusResponse returns current status and optional error info. type GetConversionStatusResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - ConversionId string `protobuf:"bytes,1,opt,name=conversion_id,json=conversionId,proto3" json:"conversion_id,omitempty"` - Status ConversionStatus `protobuf:"varint,2,opt,name=status,proto3,enum=officeconvertapi.v1.ConversionStatus" json:"status,omitempty"` - ErrorMessage string `protobuf:"bytes,3,opt,name=error_message,json=errorMessage,proto3" json:"error_message,omitempty"` - UpdatedAt *timestamppb.Timestamp `protobuf:"bytes,4,opt,name=updated_at,json=updatedAt,proto3" json:"updated_at,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + ConversionId string `protobuf:"bytes,1,opt,name=conversion_id,json=conversionId,proto3" json:"conversion_id,omitempty"` + Status ConversionStatus `protobuf:"varint,2,opt,name=status,proto3,enum=officeconvertapi.v1.ConversionStatus" json:"status,omitempty"` + ErrorMessage string `protobuf:"bytes,3,opt,name=error_message,json=errorMessage,proto3" json:"error_message,omitempty"` + UpdatedAt *timestamppb.Timestamp `protobuf:"bytes,4,opt,name=updated_at,json=updatedAt,proto3" json:"updated_at,omitempty"` + Phase ConversionPhase `protobuf:"varint,5,opt,name=phase,proto3,enum=officeconvertapi.v1.ConversionPhase" json:"phase,omitempty"` + CurrentProgress int32 `protobuf:"varint,6,opt,name=current_progress,json=currentProgress,proto3" json:"current_progress,omitempty"` + MaxProgress int32 `protobuf:"varint,7,opt,name=max_progress,json=maxProgress,proto3" json:"max_progress,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *GetConversionStatusResponse) Reset() { @@ -542,6 +604,27 @@ func (x *GetConversionStatusResponse) GetUpdatedAt() *timestamppb.Timestamp { return nil } +func (x *GetConversionStatusResponse) GetPhase() ConversionPhase { + if x != nil { + return x.Phase + } + return ConversionPhase_CONVERSION_PHASE_UNSPECIFIED +} + +func (x *GetConversionStatusResponse) GetCurrentProgress() int32 { + if x != nil { + return x.CurrentProgress + } + return 0 +} + +func (x *GetConversionStatusResponse) GetMaxProgress() int32 { + if x != nil { + return x.MaxProgress + } + return 0 +} + // GetSlideDeckRequest fetches a completed deck. type GetSlideDeckRequest struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -762,13 +845,16 @@ const file_officeconvertapi_v1_conversion_proto_rawDesc = "" + "\rconversion_id\x18\x01 \x01(\tR\fconversionId\x12=\n" + "\x06status\x18\x02 \x01(\x0e2%.officeconvertapi.v1.ConversionStatusR\x06status\"A\n" + "\x1aGetConversionStatusRequest\x12#\n" + - "\rconversion_id\x18\x01 \x01(\tR\fconversionId\"\xe1\x01\n" + + "\rconversion_id\x18\x01 \x01(\tR\fconversionId\"\xeb\x02\n" + "\x1bGetConversionStatusResponse\x12#\n" + "\rconversion_id\x18\x01 \x01(\tR\fconversionId\x12=\n" + "\x06status\x18\x02 \x01(\x0e2%.officeconvertapi.v1.ConversionStatusR\x06status\x12#\n" + "\rerror_message\x18\x03 \x01(\tR\ferrorMessage\x129\n" + "\n" + - "updated_at\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\tupdatedAt\":\n" + + "updated_at\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\tupdatedAt\x12:\n" + + "\x05phase\x18\x05 \x01(\x0e2$.officeconvertapi.v1.ConversionPhaseR\x05phase\x12)\n" + + "\x10current_progress\x18\x06 \x01(\x05R\x0fcurrentProgress\x12!\n" + + "\fmax_progress\x18\a \x01(\x05R\vmaxProgress\":\n" + "\x13GetSlideDeckRequest\x12#\n" + "\rconversion_id\x18\x01 \x01(\tR\fconversionId\"U\n" + "\x14GetSlideDeckResponse\x12=\n" + @@ -784,7 +870,14 @@ const file_officeconvertapi_v1_conversion_proto_rawDesc = "" + "\x19CONVERSION_STATUS_PENDING\x10\x01\x12\x1d\n" + "\x19CONVERSION_STATUS_RUNNING\x10\x02\x12\x1f\n" + "\x1bCONVERSION_STATUS_SUCCEEDED\x10\x03\x12\x1c\n" + - "\x18CONVERSION_STATUS_FAILED\x10\x042\xcc\x04\n" + + "\x18CONVERSION_STATUS_FAILED\x10\x04*\xe7\x01\n" + + "\x0fConversionPhase\x12 \n" + + "\x1cCONVERSION_PHASE_UNSPECIFIED\x10\x00\x12\x1d\n" + + "\x19CONVERSION_PHASE_INACTIVE\x10\x01\x12%\n" + + "!CONVERSION_PHASE_EXTRACTING_NOTES\x10\x02\x12 \n" + + "\x1cCONVERSION_PHASE_PPTX_TO_PDF\x10\x03\x12\"\n" + + "\x1eCONVERSION_PHASE_PDF_TO_IMAGES\x10\x04\x12&\n" + + "\"CONVERSION_PHASE_UPLOADING_RESULTS\x10\x052\xcc\x04\n" + "\x11ConversionService\x12q\n" + "\x10CreateConversion\x12,.officeconvertapi.v1.CreateConversionRequest\x1a-.officeconvertapi.v1.CreateConversionResponse\"\x00\x12n\n" + "\x0fStartConversion\x12+.officeconvertapi.v1.StartConversionRequest\x1a,.officeconvertapi.v1.StartConversionResponse\"\x00\x12z\n" + @@ -804,47 +897,49 @@ func file_officeconvertapi_v1_conversion_proto_rawDescGZIP() []byte { return file_officeconvertapi_v1_conversion_proto_rawDescData } -var file_officeconvertapi_v1_conversion_proto_enumTypes = make([]protoimpl.EnumInfo, 1) +var file_officeconvertapi_v1_conversion_proto_enumTypes = make([]protoimpl.EnumInfo, 2) var file_officeconvertapi_v1_conversion_proto_msgTypes = make([]protoimpl.MessageInfo, 12) var file_officeconvertapi_v1_conversion_proto_goTypes = []any{ (ConversionStatus)(0), // 0: officeconvertapi.v1.ConversionStatus - (*Slide)(nil), // 1: officeconvertapi.v1.Slide - (*SlideDeck)(nil), // 2: officeconvertapi.v1.SlideDeck - (*CreateConversionRequest)(nil), // 3: officeconvertapi.v1.CreateConversionRequest - (*CreateConversionResponse)(nil), // 4: officeconvertapi.v1.CreateConversionResponse - (*StartConversionRequest)(nil), // 5: officeconvertapi.v1.StartConversionRequest - (*StartConversionResponse)(nil), // 6: officeconvertapi.v1.StartConversionResponse - (*GetConversionStatusRequest)(nil), // 7: officeconvertapi.v1.GetConversionStatusRequest - (*GetConversionStatusResponse)(nil), // 8: officeconvertapi.v1.GetConversionStatusResponse - (*GetSlideDeckRequest)(nil), // 9: officeconvertapi.v1.GetSlideDeckRequest - (*GetSlideDeckResponse)(nil), // 10: officeconvertapi.v1.GetSlideDeckResponse - (*DeleteConversionRequest)(nil), // 11: officeconvertapi.v1.DeleteConversionRequest - (*DeleteConversionResponse)(nil), // 12: officeconvertapi.v1.DeleteConversionResponse - (*timestamppb.Timestamp)(nil), // 13: google.protobuf.Timestamp + (ConversionPhase)(0), // 1: officeconvertapi.v1.ConversionPhase + (*Slide)(nil), // 2: officeconvertapi.v1.Slide + (*SlideDeck)(nil), // 3: officeconvertapi.v1.SlideDeck + (*CreateConversionRequest)(nil), // 4: officeconvertapi.v1.CreateConversionRequest + (*CreateConversionResponse)(nil), // 5: officeconvertapi.v1.CreateConversionResponse + (*StartConversionRequest)(nil), // 6: officeconvertapi.v1.StartConversionRequest + (*StartConversionResponse)(nil), // 7: officeconvertapi.v1.StartConversionResponse + (*GetConversionStatusRequest)(nil), // 8: officeconvertapi.v1.GetConversionStatusRequest + (*GetConversionStatusResponse)(nil), // 9: officeconvertapi.v1.GetConversionStatusResponse + (*GetSlideDeckRequest)(nil), // 10: officeconvertapi.v1.GetSlideDeckRequest + (*GetSlideDeckResponse)(nil), // 11: officeconvertapi.v1.GetSlideDeckResponse + (*DeleteConversionRequest)(nil), // 12: officeconvertapi.v1.DeleteConversionRequest + (*DeleteConversionResponse)(nil), // 13: officeconvertapi.v1.DeleteConversionResponse + (*timestamppb.Timestamp)(nil), // 14: google.protobuf.Timestamp } var file_officeconvertapi_v1_conversion_proto_depIdxs = []int32{ - 1, // 0: officeconvertapi.v1.SlideDeck.slides:type_name -> officeconvertapi.v1.Slide - 13, // 1: officeconvertapi.v1.SlideDeck.created_at:type_name -> google.protobuf.Timestamp - 13, // 2: officeconvertapi.v1.CreateConversionResponse.expires_at:type_name -> google.protobuf.Timestamp + 2, // 0: officeconvertapi.v1.SlideDeck.slides:type_name -> officeconvertapi.v1.Slide + 14, // 1: officeconvertapi.v1.SlideDeck.created_at:type_name -> google.protobuf.Timestamp + 14, // 2: officeconvertapi.v1.CreateConversionResponse.expires_at:type_name -> google.protobuf.Timestamp 0, // 3: officeconvertapi.v1.StartConversionResponse.status:type_name -> officeconvertapi.v1.ConversionStatus 0, // 4: officeconvertapi.v1.GetConversionStatusResponse.status:type_name -> officeconvertapi.v1.ConversionStatus - 13, // 5: officeconvertapi.v1.GetConversionStatusResponse.updated_at:type_name -> google.protobuf.Timestamp - 2, // 6: officeconvertapi.v1.GetSlideDeckResponse.slide_deck:type_name -> officeconvertapi.v1.SlideDeck - 3, // 7: officeconvertapi.v1.ConversionService.CreateConversion:input_type -> officeconvertapi.v1.CreateConversionRequest - 5, // 8: officeconvertapi.v1.ConversionService.StartConversion:input_type -> officeconvertapi.v1.StartConversionRequest - 7, // 9: officeconvertapi.v1.ConversionService.GetConversionStatus:input_type -> officeconvertapi.v1.GetConversionStatusRequest - 9, // 10: officeconvertapi.v1.ConversionService.GetSlideDeck:input_type -> officeconvertapi.v1.GetSlideDeckRequest - 11, // 11: officeconvertapi.v1.ConversionService.DeleteConversion:input_type -> officeconvertapi.v1.DeleteConversionRequest - 4, // 12: officeconvertapi.v1.ConversionService.CreateConversion:output_type -> officeconvertapi.v1.CreateConversionResponse - 6, // 13: officeconvertapi.v1.ConversionService.StartConversion:output_type -> officeconvertapi.v1.StartConversionResponse - 8, // 14: officeconvertapi.v1.ConversionService.GetConversionStatus:output_type -> officeconvertapi.v1.GetConversionStatusResponse - 10, // 15: officeconvertapi.v1.ConversionService.GetSlideDeck:output_type -> officeconvertapi.v1.GetSlideDeckResponse - 12, // 16: officeconvertapi.v1.ConversionService.DeleteConversion:output_type -> officeconvertapi.v1.DeleteConversionResponse - 12, // [12:17] is the sub-list for method output_type - 7, // [7:12] is the sub-list for method input_type - 7, // [7:7] is the sub-list for extension type_name - 7, // [7:7] is the sub-list for extension extendee - 0, // [0:7] is the sub-list for field type_name + 14, // 5: officeconvertapi.v1.GetConversionStatusResponse.updated_at:type_name -> google.protobuf.Timestamp + 1, // 6: officeconvertapi.v1.GetConversionStatusResponse.phase:type_name -> officeconvertapi.v1.ConversionPhase + 3, // 7: officeconvertapi.v1.GetSlideDeckResponse.slide_deck:type_name -> officeconvertapi.v1.SlideDeck + 4, // 8: officeconvertapi.v1.ConversionService.CreateConversion:input_type -> officeconvertapi.v1.CreateConversionRequest + 6, // 9: officeconvertapi.v1.ConversionService.StartConversion:input_type -> officeconvertapi.v1.StartConversionRequest + 8, // 10: officeconvertapi.v1.ConversionService.GetConversionStatus:input_type -> officeconvertapi.v1.GetConversionStatusRequest + 10, // 11: officeconvertapi.v1.ConversionService.GetSlideDeck:input_type -> officeconvertapi.v1.GetSlideDeckRequest + 12, // 12: officeconvertapi.v1.ConversionService.DeleteConversion:input_type -> officeconvertapi.v1.DeleteConversionRequest + 5, // 13: officeconvertapi.v1.ConversionService.CreateConversion:output_type -> officeconvertapi.v1.CreateConversionResponse + 7, // 14: officeconvertapi.v1.ConversionService.StartConversion:output_type -> officeconvertapi.v1.StartConversionResponse + 9, // 15: officeconvertapi.v1.ConversionService.GetConversionStatus:output_type -> officeconvertapi.v1.GetConversionStatusResponse + 11, // 16: officeconvertapi.v1.ConversionService.GetSlideDeck:output_type -> officeconvertapi.v1.GetSlideDeckResponse + 13, // 17: officeconvertapi.v1.ConversionService.DeleteConversion:output_type -> officeconvertapi.v1.DeleteConversionResponse + 13, // [13:18] is the sub-list for method output_type + 8, // [8:13] is the sub-list for method input_type + 8, // [8:8] is the sub-list for extension type_name + 8, // [8:8] is the sub-list for extension extendee + 0, // [0:8] is the sub-list for field type_name } func init() { file_officeconvertapi_v1_conversion_proto_init() } @@ -857,7 +952,7 @@ func file_officeconvertapi_v1_conversion_proto_init() { File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_officeconvertapi_v1_conversion_proto_rawDesc), len(file_officeconvertapi_v1_conversion_proto_rawDesc)), - NumEnums: 1, + NumEnums: 2, NumMessages: 12, NumExtensions: 0, NumServices: 1, diff --git a/gen/python/officeconvertapi/v1/conversion_pb2.py b/gen/python/officeconvertapi/v1/conversion_pb2.py index 6f76af6..c7a25d8 100644 --- a/gen/python/officeconvertapi/v1/conversion_pb2.py +++ b/gen/python/officeconvertapi/v1/conversion_pb2.py @@ -25,7 +25,7 @@ _sym_db = _symbol_database.Default() from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n$officeconvertapi/v1/conversion.proto\x12\x13officeconvertapi.v1\x1a\x1fgoogle/protobuf/timestamp.proto\"[\n\x05Slide\x12\x14\n\x05index\x18\x01 \x01(\x05R\x05index\x12\x1f\n\x0bnotes_plain\x18\x02 \x01(\tR\nnotesPlain\x12\x1b\n\timage_url\x18\x03 \x01(\tR\x08imageUrl\"\xc8\x01\n\tSlideDeck\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12\'\n\x0fsource_filename\x18\x02 \x01(\tR\x0esourceFilename\x12\x32\n\x06slides\x18\x03 \x03(\x0b\x32\x1a.officeconvertapi.v1.SlideR\x06slides\x12\x39\n\ncreated_at\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.TimestampR\tcreatedAt\"B\n\x17\x43reateConversionRequest\x12\'\n\x0fsource_filename\x18\x01 \x01(\tR\x0esourceFilename\"\xea\x01\n\x18\x43reateConversionResponse\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12#\n\rupload_bucket\x18\x02 \x01(\tR\x0cuploadBucket\x12*\n\x11upload_object_key\x18\x03 \x01(\tR\x0fuploadObjectKey\x12\x1d\n\nupload_url\x18\x04 \x01(\tR\tuploadUrl\x12\x39\n\nexpires_at\x18\x05 \x01(\x0b\x32\x1a.google.protobuf.TimestampR\texpiresAt\"=\n\x16StartConversionRequest\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\"}\n\x17StartConversionResponse\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12=\n\x06status\x18\x02 \x01(\x0e\x32%.officeconvertapi.v1.ConversionStatusR\x06status\"A\n\x1aGetConversionStatusRequest\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\"\xe1\x01\n\x1bGetConversionStatusResponse\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12=\n\x06status\x18\x02 \x01(\x0e\x32%.officeconvertapi.v1.ConversionStatusR\x06status\x12#\n\rerror_message\x18\x03 \x01(\tR\x0c\x65rrorMessage\x12\x39\n\nupdated_at\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.TimestampR\tupdatedAt\":\n\x13GetSlideDeckRequest\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\"U\n\x14GetSlideDeckResponse\x12=\n\nslide_deck\x18\x01 \x01(\x0b\x32\x1e.officeconvertapi.v1.SlideDeckR\tslideDeck\">\n\x17\x44\x65leteConversionRequest\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\"Y\n\x18\x44\x65leteConversionResponse\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12\x18\n\x07\x64\x65leted\x18\x02 \x01(\x08R\x07\x64\x65leted*\xb2\x01\n\x10\x43onversionStatus\x12!\n\x1d\x43ONVERSION_STATUS_UNSPECIFIED\x10\x00\x12\x1d\n\x19\x43ONVERSION_STATUS_PENDING\x10\x01\x12\x1d\n\x19\x43ONVERSION_STATUS_RUNNING\x10\x02\x12\x1f\n\x1b\x43ONVERSION_STATUS_SUCCEEDED\x10\x03\x12\x1c\n\x18\x43ONVERSION_STATUS_FAILED\x10\x04\x32\xcc\x04\n\x11\x43onversionService\x12q\n\x10\x43reateConversion\x12,.officeconvertapi.v1.CreateConversionRequest\x1a-.officeconvertapi.v1.CreateConversionResponse\"\x00\x12n\n\x0fStartConversion\x12+.officeconvertapi.v1.StartConversionRequest\x1a,.officeconvertapi.v1.StartConversionResponse\"\x00\x12z\n\x13GetConversionStatus\x12/.officeconvertapi.v1.GetConversionStatusRequest\x1a\x30.officeconvertapi.v1.GetConversionStatusResponse\"\x00\x12\x65\n\x0cGetSlideDeck\x12(.officeconvertapi.v1.GetSlideDeckRequest\x1a).officeconvertapi.v1.GetSlideDeckResponse\"\x00\x12q\n\x10\x44\x65leteConversion\x12,.officeconvertapi.v1.DeleteConversionRequest\x1a-.officeconvertapi.v1.DeleteConversionResponse\"\x00\x42LZJgithub.com/end/officeconvert/gen/go/officeconvertapi/v1;officeconvertapiv1b\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n$officeconvertapi/v1/conversion.proto\x12\x13officeconvertapi.v1\x1a\x1fgoogle/protobuf/timestamp.proto\"[\n\x05Slide\x12\x14\n\x05index\x18\x01 \x01(\x05R\x05index\x12\x1f\n\x0bnotes_plain\x18\x02 \x01(\tR\nnotesPlain\x12\x1b\n\timage_url\x18\x03 \x01(\tR\x08imageUrl\"\xc8\x01\n\tSlideDeck\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12\'\n\x0fsource_filename\x18\x02 \x01(\tR\x0esourceFilename\x12\x32\n\x06slides\x18\x03 \x03(\x0b\x32\x1a.officeconvertapi.v1.SlideR\x06slides\x12\x39\n\ncreated_at\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.TimestampR\tcreatedAt\"B\n\x17\x43reateConversionRequest\x12\'\n\x0fsource_filename\x18\x01 \x01(\tR\x0esourceFilename\"\xea\x01\n\x18\x43reateConversionResponse\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12#\n\rupload_bucket\x18\x02 \x01(\tR\x0cuploadBucket\x12*\n\x11upload_object_key\x18\x03 \x01(\tR\x0fuploadObjectKey\x12\x1d\n\nupload_url\x18\x04 \x01(\tR\tuploadUrl\x12\x39\n\nexpires_at\x18\x05 \x01(\x0b\x32\x1a.google.protobuf.TimestampR\texpiresAt\"=\n\x16StartConversionRequest\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\"}\n\x17StartConversionResponse\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12=\n\x06status\x18\x02 \x01(\x0e\x32%.officeconvertapi.v1.ConversionStatusR\x06status\"A\n\x1aGetConversionStatusRequest\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\"\xeb\x02\n\x1bGetConversionStatusResponse\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12=\n\x06status\x18\x02 \x01(\x0e\x32%.officeconvertapi.v1.ConversionStatusR\x06status\x12#\n\rerror_message\x18\x03 \x01(\tR\x0c\x65rrorMessage\x12\x39\n\nupdated_at\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.TimestampR\tupdatedAt\x12:\n\x05phase\x18\x05 \x01(\x0e\x32$.officeconvertapi.v1.ConversionPhaseR\x05phase\x12)\n\x10\x63urrent_progress\x18\x06 \x01(\x05R\x0f\x63urrentProgress\x12!\n\x0cmax_progress\x18\x07 \x01(\x05R\x0bmaxProgress\":\n\x13GetSlideDeckRequest\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\"U\n\x14GetSlideDeckResponse\x12=\n\nslide_deck\x18\x01 \x01(\x0b\x32\x1e.officeconvertapi.v1.SlideDeckR\tslideDeck\">\n\x17\x44\x65leteConversionRequest\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\"Y\n\x18\x44\x65leteConversionResponse\x12#\n\rconversion_id\x18\x01 \x01(\tR\x0c\x63onversionId\x12\x18\n\x07\x64\x65leted\x18\x02 \x01(\x08R\x07\x64\x65leted*\xb2\x01\n\x10\x43onversionStatus\x12!\n\x1d\x43ONVERSION_STATUS_UNSPECIFIED\x10\x00\x12\x1d\n\x19\x43ONVERSION_STATUS_PENDING\x10\x01\x12\x1d\n\x19\x43ONVERSION_STATUS_RUNNING\x10\x02\x12\x1f\n\x1b\x43ONVERSION_STATUS_SUCCEEDED\x10\x03\x12\x1c\n\x18\x43ONVERSION_STATUS_FAILED\x10\x04*\xe7\x01\n\x0f\x43onversionPhase\x12 \n\x1c\x43ONVERSION_PHASE_UNSPECIFIED\x10\x00\x12\x1d\n\x19\x43ONVERSION_PHASE_INACTIVE\x10\x01\x12%\n!CONVERSION_PHASE_EXTRACTING_NOTES\x10\x02\x12 \n\x1c\x43ONVERSION_PHASE_PPTX_TO_PDF\x10\x03\x12\"\n\x1e\x43ONVERSION_PHASE_PDF_TO_IMAGES\x10\x04\x12&\n\"CONVERSION_PHASE_UPLOADING_RESULTS\x10\x05\x32\xcc\x04\n\x11\x43onversionService\x12q\n\x10\x43reateConversion\x12,.officeconvertapi.v1.CreateConversionRequest\x1a-.officeconvertapi.v1.CreateConversionResponse\"\x00\x12n\n\x0fStartConversion\x12+.officeconvertapi.v1.StartConversionRequest\x1a,.officeconvertapi.v1.StartConversionResponse\"\x00\x12z\n\x13GetConversionStatus\x12/.officeconvertapi.v1.GetConversionStatusRequest\x1a\x30.officeconvertapi.v1.GetConversionStatusResponse\"\x00\x12\x65\n\x0cGetSlideDeck\x12(.officeconvertapi.v1.GetSlideDeckRequest\x1a).officeconvertapi.v1.GetSlideDeckResponse\"\x00\x12q\n\x10\x44\x65leteConversion\x12,.officeconvertapi.v1.DeleteConversionRequest\x1a-.officeconvertapi.v1.DeleteConversionResponse\"\x00\x42LZJgithub.com/end/officeconvert/gen/go/officeconvertapi/v1;officeconvertapiv1b\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -33,8 +33,10 @@ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'officeconvertapi.v1.convers if not _descriptor._USE_C_DESCRIPTORS: _globals['DESCRIPTOR']._loaded_options = None _globals['DESCRIPTOR']._serialized_options = b'ZJgithub.com/end/officeconvert/gen/go/officeconvertapi/v1;officeconvertapiv1' - _globals['_CONVERSIONSTATUS']._serialized_start=1483 - _globals['_CONVERSIONSTATUS']._serialized_end=1661 + _globals['_CONVERSIONSTATUS']._serialized_start=1621 + _globals['_CONVERSIONSTATUS']._serialized_end=1799 + _globals['_CONVERSIONPHASE']._serialized_start=1802 + _globals['_CONVERSIONPHASE']._serialized_end=2033 _globals['_SLIDE']._serialized_start=94 _globals['_SLIDE']._serialized_end=185 _globals['_SLIDEDECK']._serialized_start=188 @@ -50,15 +52,15 @@ if not _descriptor._USE_C_DESCRIPTORS: _globals['_GETCONVERSIONSTATUSREQUEST']._serialized_start=885 _globals['_GETCONVERSIONSTATUSREQUEST']._serialized_end=950 _globals['_GETCONVERSIONSTATUSRESPONSE']._serialized_start=953 - _globals['_GETCONVERSIONSTATUSRESPONSE']._serialized_end=1178 - _globals['_GETSLIDEDECKREQUEST']._serialized_start=1180 - _globals['_GETSLIDEDECKREQUEST']._serialized_end=1238 - _globals['_GETSLIDEDECKRESPONSE']._serialized_start=1240 - _globals['_GETSLIDEDECKRESPONSE']._serialized_end=1325 - _globals['_DELETECONVERSIONREQUEST']._serialized_start=1327 - _globals['_DELETECONVERSIONREQUEST']._serialized_end=1389 - _globals['_DELETECONVERSIONRESPONSE']._serialized_start=1391 - _globals['_DELETECONVERSIONRESPONSE']._serialized_end=1480 - _globals['_CONVERSIONSERVICE']._serialized_start=1664 - _globals['_CONVERSIONSERVICE']._serialized_end=2252 + _globals['_GETCONVERSIONSTATUSRESPONSE']._serialized_end=1316 + _globals['_GETSLIDEDECKREQUEST']._serialized_start=1318 + _globals['_GETSLIDEDECKREQUEST']._serialized_end=1376 + _globals['_GETSLIDEDECKRESPONSE']._serialized_start=1378 + _globals['_GETSLIDEDECKRESPONSE']._serialized_end=1463 + _globals['_DELETECONVERSIONREQUEST']._serialized_start=1465 + _globals['_DELETECONVERSIONREQUEST']._serialized_end=1527 + _globals['_DELETECONVERSIONRESPONSE']._serialized_start=1529 + _globals['_DELETECONVERSIONRESPONSE']._serialized_end=1618 + _globals['_CONVERSIONSERVICE']._serialized_start=2036 + _globals['_CONVERSIONSERVICE']._serialized_end=2624 # @@protoc_insertion_point(module_scope) diff --git a/gen/python/officeconvertapi/v1/conversion_pb2.pyi b/gen/python/officeconvertapi/v1/conversion_pb2.pyi index 877b479..23d7116 100644 --- a/gen/python/officeconvertapi/v1/conversion_pb2.pyi +++ b/gen/python/officeconvertapi/v1/conversion_pb2.pyi @@ -17,11 +17,26 @@ class ConversionStatus(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): CONVERSION_STATUS_RUNNING: _ClassVar[ConversionStatus] CONVERSION_STATUS_SUCCEEDED: _ClassVar[ConversionStatus] CONVERSION_STATUS_FAILED: _ClassVar[ConversionStatus] + +class ConversionPhase(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + CONVERSION_PHASE_UNSPECIFIED: _ClassVar[ConversionPhase] + CONVERSION_PHASE_INACTIVE: _ClassVar[ConversionPhase] + CONVERSION_PHASE_EXTRACTING_NOTES: _ClassVar[ConversionPhase] + CONVERSION_PHASE_PPTX_TO_PDF: _ClassVar[ConversionPhase] + CONVERSION_PHASE_PDF_TO_IMAGES: _ClassVar[ConversionPhase] + CONVERSION_PHASE_UPLOADING_RESULTS: _ClassVar[ConversionPhase] CONVERSION_STATUS_UNSPECIFIED: ConversionStatus CONVERSION_STATUS_PENDING: ConversionStatus CONVERSION_STATUS_RUNNING: ConversionStatus CONVERSION_STATUS_SUCCEEDED: ConversionStatus CONVERSION_STATUS_FAILED: ConversionStatus +CONVERSION_PHASE_UNSPECIFIED: ConversionPhase +CONVERSION_PHASE_INACTIVE: ConversionPhase +CONVERSION_PHASE_EXTRACTING_NOTES: ConversionPhase +CONVERSION_PHASE_PPTX_TO_PDF: ConversionPhase +CONVERSION_PHASE_PDF_TO_IMAGES: ConversionPhase +CONVERSION_PHASE_UPLOADING_RESULTS: ConversionPhase class Slide(_message.Message): __slots__ = ("index", "notes_plain", "image_url") @@ -86,16 +101,22 @@ class GetConversionStatusRequest(_message.Message): def __init__(self, conversion_id: _Optional[str] = ...) -> None: ... class GetConversionStatusResponse(_message.Message): - __slots__ = ("conversion_id", "status", "error_message", "updated_at") + __slots__ = ("conversion_id", "status", "error_message", "updated_at", "phase", "current_progress", "max_progress") CONVERSION_ID_FIELD_NUMBER: _ClassVar[int] STATUS_FIELD_NUMBER: _ClassVar[int] ERROR_MESSAGE_FIELD_NUMBER: _ClassVar[int] UPDATED_AT_FIELD_NUMBER: _ClassVar[int] + PHASE_FIELD_NUMBER: _ClassVar[int] + CURRENT_PROGRESS_FIELD_NUMBER: _ClassVar[int] + MAX_PROGRESS_FIELD_NUMBER: _ClassVar[int] conversion_id: str status: ConversionStatus error_message: str updated_at: _timestamp_pb2.Timestamp - def __init__(self, conversion_id: _Optional[str] = ..., status: _Optional[_Union[ConversionStatus, str]] = ..., error_message: _Optional[str] = ..., updated_at: _Optional[_Union[datetime.datetime, _timestamp_pb2.Timestamp, _Mapping]] = ...) -> None: ... + phase: ConversionPhase + current_progress: int + max_progress: int + def __init__(self, conversion_id: _Optional[str] = ..., status: _Optional[_Union[ConversionStatus, str]] = ..., error_message: _Optional[str] = ..., updated_at: _Optional[_Union[datetime.datetime, _timestamp_pb2.Timestamp, _Mapping]] = ..., phase: _Optional[_Union[ConversionPhase, str]] = ..., current_progress: _Optional[int] = ..., max_progress: _Optional[int] = ...) -> None: ... class GetSlideDeckRequest(_message.Message): __slots__ = ("conversion_id",) diff --git a/proto/officeconvertapi/v1/conversion.proto b/proto/officeconvertapi/v1/conversion.proto index a2e4cc0..e6ebba0 100644 --- a/proto/officeconvertapi/v1/conversion.proto +++ b/proto/officeconvertapi/v1/conversion.proto @@ -33,6 +33,16 @@ enum ConversionStatus { CONVERSION_STATUS_FAILED = 4; } +// ConversionPhase represents the active stage for a running conversion. +enum ConversionPhase { + CONVERSION_PHASE_UNSPECIFIED = 0; + CONVERSION_PHASE_INACTIVE = 1; + CONVERSION_PHASE_EXTRACTING_NOTES = 2; + CONVERSION_PHASE_PPTX_TO_PDF = 3; + CONVERSION_PHASE_PDF_TO_IMAGES = 4; + CONVERSION_PHASE_UPLOADING_RESULTS = 5; +} + // Slide contains extracted notes and the rendered image URL for one slide. message Slide { int32 index = 1; @@ -84,6 +94,9 @@ message GetConversionStatusResponse { ConversionStatus status = 2; string error_message = 3; google.protobuf.Timestamp updated_at = 4; + ConversionPhase phase = 5; + int32 current_progress = 6; + int32 max_progress = 7; } // GetSlideDeckRequest fetches a completed deck. diff --git a/python/packages/officeconvert/src/officeconvert/conversion.py b/python/packages/officeconvert/src/officeconvert/conversion.py index b9ea7b0..16649fc 100644 --- a/python/packages/officeconvert/src/officeconvert/conversion.py +++ b/python/packages/officeconvert/src/officeconvert/conversion.py @@ -2,6 +2,7 @@ from __future__ import annotations +from collections.abc import Callable from dataclasses import dataclass from pathlib import Path import subprocess @@ -27,6 +28,14 @@ class SlideDeckResult: slides: list[SlideArtifact] +ProgressCallback = Callable[[str, int, int], None] +PageProgressCallback = Callable[[int, int], None] + +PHASE_EXTRACTING_NOTES = "extracting_notes" +PHASE_PPTX_TO_PDF = "pptx_to_pdf" +PHASE_PDF_TO_IMAGES = "pdf_to_images" + + def convert_pptx_to_pdf(pptx_path: Path, pdf_path: Path, *, timeout_s: int = 120) -> Path: """Convert a PPTX file to PDF using headless LibreOffice. @@ -92,6 +101,8 @@ def render_pdf_to_images( dpi: int = 180, image_format: str = "png", timeout_s: int = 120, + total_pages: int | None = None, + page_progress_callback: PageProgressCallback | None = None, ) -> list[Path]: """Render each PDF page into an image using Poppler's `pdftoppm`. @@ -113,35 +124,83 @@ def render_pdf_to_images( raise FileNotFoundError(f"source PDF does not exist: {pdf_path}") out_dir.mkdir(parents=True, exist_ok=True) - prefix_path = out_dir / "slide" - command = [ - "pdftoppm", - "-r", - str(dpi), - f"-{image_format}", - str(pdf_path.resolve()), - str(prefix_path), - ] - try: - completed = subprocess.run( - command, - check=False, - capture_output=True, - text=True, - timeout=timeout_s, - ) - except subprocess.TimeoutExpired as exc: - raise RuntimeError( - "Poppler rasterization timed out after " - f"{timeout_s} seconds while rendering {pdf_path.name}; " - "increase conversion PDF render timeout or lower image DPI" - ) from exc - if completed.returncode != 0: - raise RuntimeError( - f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}" - ) + if total_pages is None: + prefix_path = out_dir / "slide" + command = [ + "pdftoppm", + "-r", + str(dpi), + f"-{image_format}", + str(pdf_path.resolve()), + str(prefix_path), + ] + try: + completed = subprocess.run( + command, + check=False, + capture_output=True, + text=True, + timeout=timeout_s, + ) + except subprocess.TimeoutExpired as exc: + raise RuntimeError( + "Poppler rasterization timed out after " + f"{timeout_s} seconds while rendering {pdf_path.name}; " + "increase conversion PDF render timeout or lower image DPI" + ) from exc + if completed.returncode != 0: + raise RuntimeError( + f"Poppler rasterization failed: {completed.stderr.strip() or completed.stdout.strip()}" + ) + images = sorted(out_dir.glob(f"slide-*.{image_format}")) + else: + if total_pages < 0: + raise ValueError("total_pages must be zero or greater") + images = [] + for page_index in range(1, total_pages + 1): + page_prefix = out_dir / f"slide-{page_index:04d}" + command = [ + "pdftoppm", + "-r", + str(dpi), + f"-{image_format}", + "-f", + str(page_index), + "-l", + str(page_index), + "-singlefile", + str(pdf_path.resolve()), + str(page_prefix), + ] + try: + completed = subprocess.run( + command, + check=False, + capture_output=True, + text=True, + timeout=timeout_s, + ) + except subprocess.TimeoutExpired as exc: + raise RuntimeError( + "Poppler rasterization timed out after " + f"{timeout_s} seconds while rendering page {page_index} " + f"of {pdf_path.name}; increase conversion PDF render timeout " + "or lower image DPI" + ) from exc + if completed.returncode != 0: + raise RuntimeError( + "Poppler rasterization failed on page " + f"{page_index}: {completed.stderr.strip() or completed.stdout.strip()}" + ) + image_path = page_prefix.with_suffix(f".{image_format}") + if not image_path.exists(): + raise RuntimeError( + f"Poppler did not create expected page image: {image_path}" + ) + images.append(image_path.resolve()) + if page_progress_callback is not None: + page_progress_callback(page_index, total_pages) - images = sorted(out_dir.glob(f"slide-*.{image_format}")) if not images: raise RuntimeError(f"no rendered images found in {out_dir}") return [image.resolve() for image in images] @@ -180,6 +239,11 @@ def convert_pptx_to_slidedeck( image_format: str = "png", pptx_to_pdf_timeout_s: int = 180, pdf_to_images_timeout_s: int = 600, + pptx_to_pdf_base_timeout_s: int = 45, + pptx_to_pdf_per_slide_timeout_s: int = 3, + pdf_to_images_base_timeout_s: int = 30, + pdf_to_images_per_slide_timeout_s: int = 8, + progress_callback: ProgressCallback | None = None, ) -> SlideDeckResult: """Convert a PPTX into rendered images and extracted notes. @@ -206,15 +270,45 @@ def convert_pptx_to_slidedeck( pdf_path = work_dir / f"{pptx_path.stem}.pdf" image_dir = work_dir / "slides" - convert_pptx_to_pdf(pptx_path, pdf_path, timeout_s=pptx_to_pdf_timeout_s) + _emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 0, 1) + notes = extract_slide_notes(pptx_path) + _emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 1, 1) + slide_count = len(notes) + pptx_to_pdf_timeout = _compute_adaptive_timeout( + slide_count=slide_count, + timeout_cap_s=pptx_to_pdf_timeout_s, + base_timeout_s=pptx_to_pdf_base_timeout_s, + per_slide_timeout_s=pptx_to_pdf_per_slide_timeout_s, + ) + pdf_to_images_timeout = _compute_adaptive_timeout( + slide_count=slide_count, + timeout_cap_s=pdf_to_images_timeout_s, + base_timeout_s=pdf_to_images_base_timeout_s, + per_slide_timeout_s=pdf_to_images_per_slide_timeout_s, + ) + + _emit_progress(progress_callback, PHASE_PPTX_TO_PDF, 0, 1) + convert_pptx_to_pdf(pptx_path, pdf_path, timeout_s=pptx_to_pdf_timeout) + _emit_progress(progress_callback, PHASE_PPTX_TO_PDF, 1, 1) + + _emit_progress(progress_callback, PHASE_PDF_TO_IMAGES, 0, slide_count) image_paths = render_pdf_to_images( pdf_path, image_dir, dpi=dpi, image_format=image_format, - timeout_s=pdf_to_images_timeout_s, + timeout_s=_compute_page_timeout( + total_timeout_s=pdf_to_images_timeout, + page_count=slide_count, + ), + total_pages=slide_count, + page_progress_callback=lambda current, max_pages: _emit_progress( + progress_callback, + PHASE_PDF_TO_IMAGES, + current, + max_pages, + ), ) - notes = extract_slide_notes(pptx_path) if len(image_paths) != len(notes): raise ValueError( @@ -229,6 +323,40 @@ def convert_pptx_to_slidedeck( return SlideDeckResult(source_filename=pptx_path.name, slides=slides) +def _compute_adaptive_timeout( + *, + slide_count: int, + timeout_cap_s: int, + base_timeout_s: int, + per_slide_timeout_s: int, +) -> int: + """Compute a bounded timeout that scales linearly with slide count.""" + normalized_slides = max(1, slide_count) + adaptive_timeout = base_timeout_s + (normalized_slides * per_slide_timeout_s) + bounded_timeout = min(timeout_cap_s, adaptive_timeout) + return max(1, bounded_timeout) + + +def _compute_page_timeout(*, total_timeout_s: int, page_count: int) -> int: + """Split total PDF raster timeout into a bounded per-page timeout.""" + if page_count <= 0: + return max(1, total_timeout_s) + timeout = (total_timeout_s + page_count - 1) // page_count + return max(15, timeout) + + +def _emit_progress( + progress_callback: ProgressCallback | None, + phase: str, + current_progress: int, + max_progress: int, +) -> None: + """Emit phase/progress updates when a callback is configured.""" + if progress_callback is None: + return + progress_callback(phase, current_progress, max_progress) + + def _extract_notes_text(shapes: Iterable[object]) -> str: """Extract plain text from note shapes while preserving paragraph breaks.""" segments: list[str] = [] diff --git a/python/packages/server/src/officeconvert_server/config.py b/python/packages/server/src/officeconvert_server/config.py index 6cc3cc1..5d4eb52 100644 --- a/python/packages/server/src/officeconvert_server/config.py +++ b/python/packages/server/src/officeconvert_server/config.py @@ -19,6 +19,10 @@ class ServerConfig: conversion_image_dpi: int conversion_pptx_to_pdf_timeout_seconds: int conversion_pdf_to_images_timeout_seconds: int + conversion_pptx_to_pdf_base_timeout_seconds: int + conversion_pptx_to_pdf_per_slide_timeout_seconds: int + conversion_pdf_to_images_base_timeout_seconds: int + conversion_pdf_to_images_per_slide_timeout_seconds: int conversion_cleanup_delay_seconds: int @@ -31,13 +35,25 @@ def load_server_config() -> ServerConfig: s3_secure=os.getenv("S3_USE_SSL", "false").lower() == "true", s3_public_endpoint=os.getenv("S3_PUBLIC_ENDPOINT", "localhost:8333"), s3_session_ttl_seconds=int(os.getenv("S3_SESSION_TTL_SECONDS", "3600")), - conversion_image_dpi=int(os.getenv("CONVERSION_IMAGE_DPI", "150")), + conversion_image_dpi=int(os.getenv("CONVERSION_IMAGE_DPI", "72")), conversion_pptx_to_pdf_timeout_seconds=int( os.getenv("CONVERSION_PPTX_TO_PDF_TIMEOUT_SECONDS", "180") ), conversion_pdf_to_images_timeout_seconds=int( os.getenv("CONVERSION_PDF_TO_IMAGES_TIMEOUT_SECONDS", "600") ), + conversion_pptx_to_pdf_base_timeout_seconds=int( + os.getenv("CONVERSION_PPTX_TO_PDF_BASE_TIMEOUT_SECONDS", "45") + ), + conversion_pptx_to_pdf_per_slide_timeout_seconds=int( + os.getenv("CONVERSION_PPTX_TO_PDF_PER_SLIDE_TIMEOUT_SECONDS", "3") + ), + conversion_pdf_to_images_base_timeout_seconds=int( + os.getenv("CONVERSION_PDF_TO_IMAGES_BASE_TIMEOUT_SECONDS", "30") + ), + conversion_pdf_to_images_per_slide_timeout_seconds=int( + os.getenv("CONVERSION_PDF_TO_IMAGES_PER_SLIDE_TIMEOUT_SECONDS", "8") + ), conversion_cleanup_delay_seconds=int( os.getenv("CONVERSION_CLEANUP_DELAY_SECONDS", "3600") ), diff --git a/python/packages/server/src/officeconvert_server/models.py b/python/packages/server/src/officeconvert_server/models.py index e73b487..32b9946 100644 --- a/python/packages/server/src/officeconvert_server/models.py +++ b/python/packages/server/src/officeconvert_server/models.py @@ -22,6 +22,9 @@ class ConversionSession: bucket_name: str upload_object_key: str status: conversion_pb2.ConversionStatus + phase: conversion_pb2.ConversionPhase = conversion_pb2.CONVERSION_PHASE_INACTIVE + current_progress: int = 0 + max_progress: int = 0 created_at: datetime = field(default_factory=utc_now) updated_at: datetime = field(default_factory=utc_now) error_message: str = "" diff --git a/python/packages/server/src/officeconvert_server/service.py b/python/packages/server/src/officeconvert_server/service.py index ded2403..80b8fcb 100644 --- a/python/packages/server/src/officeconvert_server/service.py +++ b/python/packages/server/src/officeconvert_server/service.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +from collections.abc import Callable from datetime import datetime, timedelta, timezone from pathlib import Path import shutil @@ -14,6 +15,11 @@ from connectrpc.errors import ConnectError from connectrpc.request import RequestContext from google.protobuf.timestamp_pb2 import Timestamp from officeconvert import SlideArtifact, convert_pptx_to_slidedeck +from officeconvert.conversion import ( + PHASE_EXTRACTING_NOTES, + PHASE_PDF_TO_IMAGES, + PHASE_PPTX_TO_PDF, +) from officeconvertapi.v1 import conversion_connect, conversion_pb2 from officeconvert_server.config import ServerConfig @@ -98,6 +104,10 @@ class ConversionServiceImpl(conversion_connect.ConversionService): ) session.status = conversion_pb2.CONVERSION_STATUS_RUNNING + session.phase = conversion_pb2.CONVERSION_PHASE_INACTIVE + session.current_progress = 0 + session.max_progress = 0 + session.error_message = "" session.updated_at = utc_now() session.conversion_task = asyncio.create_task(self._run_conversion(session)) @@ -119,6 +129,9 @@ class ConversionServiceImpl(conversion_connect.ConversionService): status=session.status, error_message=session.error_message, updated_at=_to_timestamp(session.updated_at), + phase=session.phase, + current_progress=session.current_progress, + max_progress=session.max_progress, ) async def get_slide_deck( @@ -185,22 +198,47 @@ class ConversionServiceImpl(conversion_connect.ConversionService): dpi=self._config.conversion_image_dpi, pptx_to_pdf_timeout_s=self._config.conversion_pptx_to_pdf_timeout_seconds, pdf_to_images_timeout_s=self._config.conversion_pdf_to_images_timeout_seconds, + pptx_to_pdf_base_timeout_s=self._config.conversion_pptx_to_pdf_base_timeout_seconds, + pptx_to_pdf_per_slide_timeout_s=self._config.conversion_pptx_to_pdf_per_slide_timeout_seconds, + pdf_to_images_base_timeout_s=self._config.conversion_pdf_to_images_base_timeout_seconds, + pdf_to_images_per_slide_timeout_s=self._config.conversion_pdf_to_images_per_slide_timeout_seconds, + progress_callback=lambda phase_name, current, max_value: self._set_session_progress_from_name( + session, + phase_name=phase_name, + current_progress=current, + max_progress=max_value, + ), + ) + self._set_session_progress( + session, + phase=conversion_pb2.CONVERSION_PHASE_UPLOADING_RESULTS, + current_progress=0, + max_progress=len(result.slides), ) session.slide_deck = await asyncio.to_thread( self._upload_and_build_slide_deck, session, result.slides, result.source_filename, + lambda current, max_value: self._set_session_progress( + session, + phase=conversion_pb2.CONVERSION_PHASE_UPLOADING_RESULTS, + current_progress=current, + max_progress=max_value, + ), ) session.status = conversion_pb2.CONVERSION_STATUS_SUCCEEDED + session.phase = conversion_pb2.CONVERSION_PHASE_INACTIVE session.updated_at = utc_now() except asyncio.CancelledError: session.status = conversion_pb2.CONVERSION_STATUS_FAILED + session.phase = conversion_pb2.CONVERSION_PHASE_INACTIVE session.error_message = "conversion cancelled" session.updated_at = utc_now() raise except Exception as exc: session.status = conversion_pb2.CONVERSION_STATUS_FAILED + session.phase = conversion_pb2.CONVERSION_PHASE_INACTIVE session.error_message = str(exc) session.updated_at = utc_now() finally: @@ -212,10 +250,12 @@ class ConversionServiceImpl(conversion_connect.ConversionService): session: ConversionSession, slides: list[SlideArtifact], source_filename: str, + progress_callback: Callable[[int, int], None] | None = None, ) -> conversion_pb2.SlideDeck: """Upload generated slide images and construct API response payload.""" response_slides: list[conversion_pb2.Slide] = [] - for slide in slides: + slide_total = len(slides) + for slide_index, slide in enumerate(slides, start=1): object_key = f"output/slide-{slide.index:04d}{slide.image_path.suffix}" self._store.fput_object(session.bucket_name, object_key, slide.image_path) image_url = self._store.presigned_get_url( @@ -230,6 +270,8 @@ class ConversionServiceImpl(conversion_connect.ConversionService): image_url=image_url, ) ) + if progress_callback is not None: + progress_callback(slide_index, slide_total) return conversion_pb2.SlideDeck( conversion_id=session.conversion_id, @@ -263,6 +305,45 @@ class ConversionServiceImpl(conversion_connect.ConversionService): raise ConnectError(Code.NOT_FOUND, "conversion_id not found") return session + def _set_session_progress_from_name( + self, + session: ConversionSession, + *, + phase_name: str, + current_progress: int, + max_progress: int, + ) -> None: + """Map conversion-library phase names onto API enum phases.""" + phase_map = { + PHASE_EXTRACTING_NOTES: conversion_pb2.CONVERSION_PHASE_EXTRACTING_NOTES, + PHASE_PPTX_TO_PDF: conversion_pb2.CONVERSION_PHASE_PPTX_TO_PDF, + PHASE_PDF_TO_IMAGES: conversion_pb2.CONVERSION_PHASE_PDF_TO_IMAGES, + } + self._set_session_progress( + session, + phase=phase_map.get(phase_name, conversion_pb2.CONVERSION_PHASE_INACTIVE), + current_progress=current_progress, + max_progress=max_progress, + ) + + def _set_session_progress( + self, + session: ConversionSession, + *, + phase: conversion_pb2.ConversionPhase, + current_progress: int, + max_progress: int, + ) -> None: + """Set normalized phase/progress counters and touch update timestamp.""" + normalized_max = max(0, max_progress) + normalized_current = max(0, current_progress) + if normalized_max > 0: + normalized_current = min(normalized_current, normalized_max) + session.phase = phase + session.current_progress = normalized_current + session.max_progress = normalized_max + session.updated_at = utc_now() + def _to_timestamp(value: datetime) -> Timestamp: """Convert a timezone-aware datetime to protobuf Timestamp."""