add rich output support for slide notes
Docker server image / build-and-push (push) Successful in 3m2s
Docker server image / build-and-push (push) Successful in 3m2s
This commit is contained in:
@@ -1,19 +1,27 @@
|
||||
"""Public conversion APIs for the officeconvert Python library."""
|
||||
|
||||
from officeconvert.conversion import (
|
||||
HtmlFormattingPolicy,
|
||||
NotesFormat,
|
||||
NotesOptions,
|
||||
SlideArtifact,
|
||||
SlideDeckResult,
|
||||
convert_pptx_to_pdf,
|
||||
convert_pptx_to_slidedeck,
|
||||
extract_slide_notes,
|
||||
extract_slide_notes_html,
|
||||
render_pdf_to_images,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"HtmlFormattingPolicy",
|
||||
"NotesFormat",
|
||||
"NotesOptions",
|
||||
"SlideArtifact",
|
||||
"SlideDeckResult",
|
||||
"convert_pptx_to_pdf",
|
||||
"convert_pptx_to_slidedeck",
|
||||
"extract_slide_notes",
|
||||
"extract_slide_notes_html",
|
||||
"render_pdf_to_images",
|
||||
]
|
||||
|
||||
@@ -4,6 +4,8 @@ from __future__ import annotations
|
||||
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
import html
|
||||
import logging
|
||||
import math
|
||||
from pathlib import Path
|
||||
@@ -13,6 +15,28 @@ from typing import Iterable
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class NotesFormat(str, Enum):
    """Output representations available when extracting slide notes.

    Members subclass ``str``, so each one compares equal to its raw value.
    """

    # Notes rendered as plain text.
    PLAIN = "plain"
    # Notes rendered as an HTML fragment.
    HTML = "html"
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class HtmlFormattingPolicy:
|
||||
ignore_bold: bool = False
|
||||
ignore_italic: bool = False
|
||||
ignore_underline: bool = False
|
||||
ignore_strikethrough: bool = False
|
||||
ignore_font_size: bool = False
|
||||
ignore_color: bool = False
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
class NotesOptions:
    """Immutable settings that control how slide notes are extracted."""

    # Which representation to produce; plain text unless requested otherwise.
    format: NotesFormat = NotesFormat.PLAIN
    # HTML output only: whether paragraphs are wrapped in <p> tags.
    html_use_paragraph_tags: bool = True
    # HTML output only: formatting features to drop. A single shared default
    # instance is fine here because HtmlFormattingPolicy is frozen.
    html_policy: HtmlFormattingPolicy = HtmlFormattingPolicy()
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SlideArtifact:
|
||||
"""Represents one converted slide image and its extracted notes."""
|
||||
@@ -21,6 +45,7 @@ class SlideArtifact:
|
||||
image_path: Path
|
||||
thumbnail_path: Path
|
||||
notes_plain: str
|
||||
notes_html: str = ""
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
@@ -290,14 +315,52 @@ def extract_slide_notes(pptx_path: Path) -> list[str]:
|
||||
if not pptx_path.exists():
|
||||
raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")
|
||||
|
||||
plain, _html = _extract_slide_notes(pptx_path, options=NotesOptions(format=NotesFormat.PLAIN))
|
||||
return plain
|
||||
|
||||
|
||||
def extract_slide_notes_html(
    pptx_path: Path,
    *,
    options: NotesOptions | None = None,
) -> list[str]:
    """Return one sanitized HTML notes fragment per slide, in slide order.

    The HTML is sanitized by construction rather than by post-filtering:
    - text content is always escaped
    - only a small allowlist of tags is emitted: p, br, strong, em, u, s, span
    - only `style` attributes generated by this function are emitted on span tags

    Args:
        pptx_path: Location of the source presentation.
        options: Extraction settings. When omitted, HTML-format defaults are
            used. When supplied, ``options.format`` must be ``NotesFormat.HTML``.

    Returns:
        A list with one HTML string per slide.

    Raises:
        FileNotFoundError: If ``pptx_path`` does not exist.
        ValueError: If ``options`` requests a format other than HTML.
    """
    # The existence check deliberately comes first so a missing file is
    # reported before any option validation.
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    if options:
        effective_options = options
    else:
        effective_options = NotesOptions(format=NotesFormat.HTML)

    if effective_options.format != NotesFormat.HTML:
        raise ValueError("extract_slide_notes_html requires NotesOptions.format=NotesFormat.HTML")

    # The shared extractor returns (plain, html); only the HTML half is needed.
    return _extract_slide_notes(pptx_path, options=effective_options)[1]
|
||||
|
||||
|
||||
def _extract_slide_notes(
    pptx_path: Path,
    *,
    options: NotesOptions,
) -> tuple[list[str], list[str]]:
    """Extract plain-text and (optionally) HTML notes for every slide.

    Args:
        pptx_path: Location of the presentation to open.
        options: Extraction settings; HTML is only produced when
            ``options.format`` is ``NotesFormat.HTML``.

    Returns:
        ``(plain_notes, html_notes)``: two lists of equal length with one
        entry per slide, in slide order. Slides without notes contribute ``""``
        to both lists, and ``html_notes`` holds ``""`` for every slide when
        HTML output was not requested.
    """
    presentation = Presentation(str(pptx_path.resolve()))
    want_html = options.format == NotesFormat.HTML

    plain_notes: list[str] = []
    html_notes: list[str] = []
    for slide in presentation.slides:
        # Check has_notes_slide before touching notes_slide, and keep both
        # lists aligned with the slide index for slides without notes.
        if not slide.has_notes_slide:
            plain_notes.append("")
            html_notes.append("")
            continue

        note_shapes = slide.notes_slide.shapes
        plain_notes.append(_extract_notes_text(note_shapes))
        html_notes.append(_extract_notes_html(note_shapes, options=options) if want_html else "")

    return plain_notes, html_notes
|
||||
|
||||
|
||||
def convert_pptx_to_slidedeck(
|
||||
@@ -314,6 +377,7 @@ def convert_pptx_to_slidedeck(
|
||||
pptx_to_pdf_per_slide_timeout_s: int = 3,
|
||||
pdf_to_images_base_timeout_s: int = 30,
|
||||
pdf_to_images_per_slide_timeout_s: int = 8,
|
||||
notes_options: NotesOptions | None = None,
|
||||
progress_callback: ProgressCallback | None = None,
|
||||
) -> SlideDeckResult:
|
||||
"""Convert a PPTX into rendered images and extracted notes.
|
||||
@@ -344,8 +408,9 @@ def convert_pptx_to_slidedeck(
|
||||
full_image_dir = work_dir / "slides_full"
|
||||
thumbnail_image_dir = work_dir / "slides_thumb"
|
||||
|
||||
resolved_notes_options = notes_options or NotesOptions(format=NotesFormat.PLAIN)
|
||||
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 0, 1)
|
||||
notes = extract_slide_notes(pptx_path)
|
||||
notes_plain, notes_html = _extract_slide_notes(pptx_path, options=resolved_notes_options)
|
||||
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 1, 1)
|
||||
slide_width, slide_height = _read_slide_size_emu(pptx_path)
|
||||
output_width, output_height = _infer_output_dimensions_from_slide_size(
|
||||
@@ -370,7 +435,7 @@ def convert_pptx_to_slidedeck(
|
||||
output_width_px=thumbnail_width,
|
||||
output_height_px=thumbnail_height,
|
||||
)
|
||||
slide_count = len(notes)
|
||||
slide_count = len(notes_plain)
|
||||
pptx_to_pdf_timeout = _compute_adaptive_timeout(
|
||||
slide_count=slide_count,
|
||||
timeout_cap_s=pptx_to_pdf_timeout_s,
|
||||
@@ -454,15 +519,15 @@ def convert_pptx_to_slidedeck(
|
||||
),
|
||||
)
|
||||
|
||||
if len(full_image_paths) != len(notes):
|
||||
if len(full_image_paths) != len(notes_plain):
|
||||
raise ValueError(
|
||||
"rendered full-size slide count does not match note count: "
|
||||
f"{len(full_image_paths)} image(s) vs {len(notes)} note entries"
|
||||
f"{len(full_image_paths)} image(s) vs {len(notes_plain)} note entries"
|
||||
)
|
||||
if len(thumbnail_image_paths) != len(notes):
|
||||
if len(thumbnail_image_paths) != len(notes_plain):
|
||||
raise ValueError(
|
||||
"rendered thumbnail slide count does not match note count: "
|
||||
f"{len(thumbnail_image_paths)} image(s) vs {len(notes)} note entries"
|
||||
f"{len(thumbnail_image_paths)} image(s) vs {len(notes_plain)} note entries"
|
||||
)
|
||||
|
||||
slides = [
|
||||
@@ -470,10 +535,11 @@ def convert_pptx_to_slidedeck(
|
||||
index=index,
|
||||
image_path=image_path,
|
||||
thumbnail_path=thumbnail_path,
|
||||
notes_plain=note,
|
||||
notes_plain=plain,
|
||||
notes_html=html,
|
||||
)
|
||||
for index, (image_path, thumbnail_path, note) in enumerate(
|
||||
zip(full_image_paths, thumbnail_image_paths, notes),
|
||||
for index, (image_path, thumbnail_path, plain, html) in enumerate(
|
||||
zip(full_image_paths, thumbnail_image_paths, notes_plain, notes_html),
|
||||
start=1,
|
||||
)
|
||||
]
|
||||
@@ -632,3 +698,102 @@ def _extract_notes_text(shapes: Iterable[object]) -> str:
|
||||
if text:
|
||||
segments.append(text)
|
||||
return "\n\n".join(segments).strip()
|
||||
|
||||
|
||||
def _extract_notes_html(shapes: Iterable[object], *, options: NotesOptions) -> str:
    """Render the text-bearing note shapes as one sanitized HTML fragment.

    Shapes without a ``text_frame`` are skipped, as are paragraphs that render
    to an empty string. Returns ``""`` when nothing remains.
    """
    fragments: list[str] = []
    for shape in shapes:
        frame = getattr(shape, "text_frame", None)
        paragraphs = getattr(frame, "paragraphs", None) if frame is not None else None
        for paragraph in paragraphs or ():
            rendered = _paragraph_to_html(paragraph, options=options)
            if rendered:
                fragments.append(rendered)

    if not fragments:
        return ""

    if options.html_use_paragraph_tags:
        return "".join(fragments)

    # Without paragraph tags, paragraph boundaries become a double line break.
    # Any <p>...</p> wrapper still present on a fragment is peeled off first.
    flattened: list[str] = []
    for fragment in fragments:
        if fragment.startswith("<p>"):
            fragment = fragment.removeprefix("<p>").removesuffix("</p>")
        flattened.append(fragment)
    return "<br/><br/>".join(flattened)
|
||||
|
||||
|
||||
# Characters allowed in a colour value before it is written into a style attribute.
_HEX_DIGITS = frozenset("0123456789abcdefABCDEF")


def _run_inline_style(font: object, policy: HtmlFormattingPolicy) -> str:
    """Build the inline CSS declarations (colour, font size) for one run's font."""
    declarations: list[str] = []

    if not policy.ignore_color:
        rgb_obj = getattr(getattr(font, "color", None), "rgb", None)
        if rgb_obj is not None:
            rgb = str(rgb_obj)
            # Only a six-digit hex value may reach the style attribute; anything
            # else is dropped so the attribute cannot be broken out of.
            if len(rgb) == 6 and _HEX_DIGITS.issuperset(rgb):
                declarations.append(f"color: #{rgb};")

    if not policy.ignore_font_size:
        pt = getattr(getattr(font, "size", None), "pt", None)
        # Clamp to a sane range to avoid pathological CSS (this also rejects NaN/inf).
        if isinstance(pt, (int, float)) and 0.5 <= pt <= 512:
            pt_str = str(int(pt)) if float(pt).is_integer() else str(round(float(pt), 2))
            declarations.append(f"font-size: {pt_str}pt;")

    return "".join(declarations)


def _paragraph_to_html(paragraph: object, *, options: NotesOptions) -> str:
    """Render one paragraph's runs as sanitized HTML.

    Run text is HTML-escaped and newlines become ``<br/>``. Formatting is then
    re-introduced with allowlisted tags only, subject to ``options.html_policy``:
    an optional styled ``<span>`` innermost, wrapped in turn by ``<strong>``,
    ``<em>``, ``<u>`` and ``<s>``.

    Args:
        paragraph: Object exposing ``runs``; each run may expose ``text`` and ``font``.
        options: Supplies ``html_policy`` and ``html_use_paragraph_tags``.

    Returns:
        ``""`` when the paragraph has no visible content; otherwise the rendered
        runs, wrapped in ``<p>...</p>`` when ``options.html_use_paragraph_tags``
        is set.
    """
    policy = options.html_policy

    # NOTE(review): only ``paragraph.runs`` is visited; content that the
    # presentation library does not expose as a run (presumably soft line breaks
    # and fields) would be omitted. Confirm against python-pptx.
    parts: list[str] = []
    for run in getattr(paragraph, "runs", []) or []:
        text = getattr(run, "text", "") or ""
        if not text:
            continue

        # Escape first, then re-introduce only allowlisted tags.
        content = html.escape(text, quote=False).replace("\n", "<br/>")

        font = getattr(run, "font", None)
        if font is not None:
            style = _run_inline_style(font, policy)
            if style:
                content = f'<span style="{style}">{content}</span>'

            # Only an explicit True counts for bold/italic; None means "inherited".
            if not policy.ignore_bold and getattr(font, "bold", None) is True:
                content = f"<strong>{content}</strong>"
            if not policy.ignore_italic and getattr(font, "italic", None) is True:
                content = f"<em>{content}</em>"
            if not policy.ignore_underline and _truthy_underline(getattr(font, "underline", None)):
                content = f"<u>{content}</u>"
            if not policy.ignore_strikethrough and _truthy_strikethrough(run):
                content = f"<s>{content}</s>"

        parts.append(content)

    inner = "".join(parts).strip()
    if not inner:
        return ""
    if options.html_use_paragraph_tags:
        return f"<p>{inner}</p>"
    return inner
|
||||
|
||||
|
||||
def _truthy_underline(value: object) -> bool:
    """Report whether an underline attribute value means "underlined".

    python-pptx may represent underline as True/False/None or an enum value.
    ``None`` and anything equal to ``False`` count as not underlined; every
    other value (``True`` or a specific underline style) counts as underlined.
    """
    return value not in (None, False)
|
||||
|
||||
|
||||
def _truthy_strikethrough(run: object) -> bool:
    """Report whether a run's XML properties request strikethrough.

    python-pptx doesn't currently expose a first-class Font.strike property, so
    the ``strike`` attribute is read from the run's ``rPr`` element directly.
    A missing ``rPr``, a missing attribute, or one of the explicit "off"
    spellings means no strikethrough; any other value means strikethrough.
    """
    run_properties = getattr(getattr(run, "_r", None), "rPr", None)
    if run_properties is None:
        return False
    return run_properties.get("strike") not in (None, "noStrike", "false", "0")
|
||||
|
||||
@@ -0,0 +1,111 @@
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from pptx import Presentation
|
||||
from pptx.dml.color import RGBColor
|
||||
from pptx.util import Pt
|
||||
|
||||
from officeconvert.conversion import HtmlFormattingPolicy, NotesFormat, NotesOptions, extract_slide_notes_html
|
||||
|
||||
|
||||
def _build_pptx_with_formatted_notes(tmp_path: Path) -> Path:
    """Save a one-slide deck with formatted speaker notes and return its path.

    The notes hold two paragraphs: the first has a bold, red, 24pt run followed
    by an underlined, struck-through run; the second has an italic run whose
    text looks like a script tag, for exercising HTML escaping.
    """
    deck = Presentation()
    # NOTE(review): index 5 is presumably "Title Only" in python-pptx's default
    # template rather than a truly blank layout; confirm if the layout matters.
    slide = deck.slides.add_slide(deck.slide_layouts[5])
    notes_frame = slide.notes_slide.notes_text_frame

    # First paragraph: mixed runs with formatting.
    first_paragraph = notes_frame.paragraphs[0]
    first_paragraph.text = ""

    bold_red_run = first_paragraph.add_run()
    bold_red_run.text = "BoldRed24 "
    bold_red_run.font.bold = True
    bold_red_run.font.color.rgb = RGBColor(0xFF, 0x00, 0x00)
    bold_red_run.font.size = Pt(24)

    under_strike_run = first_paragraph.add_run()
    under_strike_run.text = "UnderStrike"
    under_strike_run.font.underline = True
    # python-pptx does not currently expose a first-class API for strikethrough,
    # but the underlying DrawingML uses a:rPr@strike="sngStrike" or "dblStrike".
    under_strike_run._r.rPr.set("strike", "sngStrike")

    # Second paragraph: ensure escaping is applied.
    second_paragraph = notes_frame.add_paragraph()
    script_run = second_paragraph.add_run()
    script_run.text = "<script>alert(1)</script>"
    script_run.font.italic = True

    destination = tmp_path / "notes.pptx"
    deck.save(destination)
    return destination
|
||||
|
||||
|
||||
class NotesHtmlExtractionTests(unittest.TestCase):
    """Checks for ``extract_slide_notes_html`` against a generated one-slide deck."""

    def test_extracts_basic_rich_text_and_escapes(self) -> None:
        """Formatting becomes allowlisted tags/styles and raw markup is escaped."""
        with tempfile.TemporaryDirectory() as d:
            pptx_path = _build_pptx_with_formatted_notes(Path(d))
            html_notes = extract_slide_notes_html(
                pptx_path,
                options=NotesOptions(format=NotesFormat.HTML, html_use_paragraph_tags=True),
            )

        self.assertEqual(len(html_notes), 1)
        html_out = html_notes[0]

        # Paragraphs are wrapped in <p>.
        self.assertIn("<p>", html_out)
        self.assertIn("</p>", html_out)

        # Formatting tags.
        self.assertIn("<strong>", html_out)
        self.assertIn("<em>", html_out)
        self.assertIn("<u>", html_out)
        self.assertIn("<s>", html_out)

        # Style tags for RGB and font size.
        self.assertIn('style="color: #FF0000;font-size: 24pt;"', html_out)

        # Escaping: no raw <script> tag survives; the text must appear in its
        # entity-escaped form instead.
        self.assertNotIn("<script>", html_out)
        self.assertIn("&lt;script&gt;alert(1)&lt;/script&gt;", html_out)

    def test_ignore_policy_prunes_styles_and_tags(self) -> None:
        """Features switched off in the policy leave no tag or style behind."""
        with tempfile.TemporaryDirectory() as d:
            pptx_path = _build_pptx_with_formatted_notes(Path(d))
            html_notes = extract_slide_notes_html(
                pptx_path,
                options=NotesOptions(
                    format=NotesFormat.HTML,
                    html_use_paragraph_tags=True,
                    html_policy=HtmlFormattingPolicy(
                        ignore_bold=True,
                        ignore_color=True,
                        ignore_font_size=True,
                    ),
                ),
            )

        html_out = html_notes[0]
        self.assertNotIn("<strong>", html_out)
        self.assertNotIn("color:", html_out)
        self.assertNotIn("font-size:", html_out)

    def test_br_mode_uses_double_break_between_paragraphs(self) -> None:
        """Without paragraph tags, paragraphs are separated by a double <br/>."""
        with tempfile.TemporaryDirectory() as d:
            pptx_path = _build_pptx_with_formatted_notes(Path(d))
            html_notes = extract_slide_notes_html(
                pptx_path,
                options=NotesOptions(format=NotesFormat.HTML, html_use_paragraph_tags=False),
            )

        html_out = html_notes[0]
        self.assertNotIn("<p>", html_out)
        self.assertIn("<br/><br/>", html_out)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -23,6 +23,7 @@ class ConversionSession:
|
||||
thumbnail_resolution: conversion_pb2.ConversionResolution
|
||||
full_jpeg_quality: int
|
||||
thumbnail_jpeg_quality: int
|
||||
notes: conversion_pb2.NotesOptions | None = None
|
||||
bucket_name: str
|
||||
upload_object_key: str
|
||||
status: conversion_pb2.ConversionStatus
|
||||
|
||||
@@ -18,6 +18,9 @@ from google.protobuf.timestamp_pb2 import Timestamp
|
||||
from officeconvert import SlideArtifact, convert_pptx_to_slidedeck
|
||||
from officeconvert.conversion import (
|
||||
ConversionTimeoutError,
|
||||
HtmlFormattingPolicy,
|
||||
NotesFormat,
|
||||
NotesOptions,
|
||||
PHASE_EXTRACTING_NOTES,
|
||||
PHASE_PDF_TO_IMAGES,
|
||||
PHASE_PPTX_TO_PDF,
|
||||
@@ -49,6 +52,37 @@ _DEFAULT_FULL_RESOLUTION = conversion_pb2.CONVERSION_RESOLUTION_FHD
|
||||
_DEFAULT_THUMBNAIL_RESOLUTION = conversion_pb2.CONVERSION_RESOLUTION_SD
|
||||
_DEFAULT_FULL_JPEG_QUALITY = 85
|
||||
_DEFAULT_THUMBNAIL_JPEG_QUALITY = 75
|
||||
_DEFAULT_NOTES_FORMAT = conversion_pb2.NOTES_FORMAT_PLAIN
|
||||
_DEFAULT_HTML_USE_PARAGRAPH_TAGS = True
|
||||
|
||||
|
||||
def _to_library_notes_options(
    notes: conversion_pb2.NotesOptions | None,
) -> NotesOptions | None:
    """Translate the request's protobuf notes options into library options.

    Returns ``None`` when no options message was supplied, leaving the choice
    of defaults to the caller. Otherwise:
    - a falsy ``format`` falls back to ``_DEFAULT_NOTES_FORMAT``, and anything
      other than ``NOTES_FORMAT_HTML`` maps to plain text;
    - ``html_use_paragraph_tags`` is honoured only when explicitly set on the
      message, else ``_DEFAULT_HTML_USE_PARAGRAPH_TAGS`` applies;
    - each formatting-policy flag is copied across as a ``bool``.
    """
    if notes is None:
        return None

    requested_format = notes.format or _DEFAULT_NOTES_FORMAT
    wants_html = requested_format == conversion_pb2.NOTES_FORMAT_HTML

    # Presence check distinguishes "explicitly false" from "not provided".
    if notes.HasField("html_use_paragraph_tags"):
        use_paragraph_tags = bool(notes.html_use_paragraph_tags)
    else:
        use_paragraph_tags = _DEFAULT_HTML_USE_PARAGRAPH_TAGS

    proto_policy = notes.html_policy
    return NotesOptions(
        format=NotesFormat.HTML if wants_html else NotesFormat.PLAIN,
        html_use_paragraph_tags=use_paragraph_tags,
        html_policy=HtmlFormattingPolicy(
            ignore_bold=bool(proto_policy.ignore_bold),
            ignore_italic=bool(proto_policy.ignore_italic),
            ignore_underline=bool(proto_policy.ignore_underline),
            ignore_strikethrough=bool(proto_policy.ignore_strikethrough),
            ignore_font_size=bool(proto_policy.ignore_font_size),
            ignore_color=bool(proto_policy.ignore_color),
        ),
    )
|
||||
|
||||
|
||||
class ConversionServiceImpl(conversion_connect.ConversionService):
|
||||
@@ -124,6 +158,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
|
||||
thumbnail_resolution=thumbnail_resolution,
|
||||
full_jpeg_quality=full_jpeg_quality,
|
||||
thumbnail_jpeg_quality=thumbnail_jpeg_quality,
|
||||
notes=request.notes if request.HasField("notes") else None,
|
||||
bucket_name=bucket_name,
|
||||
upload_object_key=upload_key,
|
||||
status=conversion_pb2.CONVERSION_STATUS_PENDING,
|
||||
@@ -280,6 +315,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
|
||||
pptx_to_pdf_per_slide_timeout_s=self._config.conversion_pptx_to_pdf_per_slide_timeout_seconds,
|
||||
pdf_to_images_base_timeout_s=self._config.conversion_pdf_to_images_base_timeout_seconds,
|
||||
pdf_to_images_per_slide_timeout_s=self._config.conversion_pdf_to_images_per_slide_timeout_seconds,
|
||||
notes_options=_to_library_notes_options(session.notes),
|
||||
progress_callback=lambda phase_name, current, max_value: self._set_session_progress_from_name(
|
||||
session,
|
||||
phase_name=phase_name,
|
||||
@@ -428,6 +464,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
|
||||
conversion_pb2.Slide(
|
||||
index=slide.index,
|
||||
notes_plain=slide.notes_plain,
|
||||
notes_html=slide.notes_html,
|
||||
image_url=image_url,
|
||||
thumbnail_image_url=thumbnail_image_url,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user