add rich output support for slide notes
Docker server image / build-and-push (push) Successful in 3m2s

This commit is contained in:
2026-05-07 10:35:37 -07:00
parent 500b767d58
commit 06d4122e4e
11 changed files with 831 additions and 140 deletions
@@ -1,19 +1,27 @@
"""Public conversion APIs for the officeconvert Python library."""
from officeconvert.conversion import (
HtmlFormattingPolicy,
NotesFormat,
NotesOptions,
SlideArtifact,
SlideDeckResult,
convert_pptx_to_pdf,
convert_pptx_to_slidedeck,
extract_slide_notes,
extract_slide_notes_html,
render_pdf_to_images,
)
# Public API of the package: every name imported from officeconvert.conversion
# above is re-exported here, in the same order.
__all__ = [
"HtmlFormattingPolicy",
"NotesFormat",
"NotesOptions",
"SlideArtifact",
"SlideDeckResult",
"convert_pptx_to_pdf",
"convert_pptx_to_slidedeck",
"extract_slide_notes",
"extract_slide_notes_html",
"render_pdf_to_images",
]
@@ -4,6 +4,8 @@ from __future__ import annotations
from collections.abc import Callable
from dataclasses import dataclass
from enum import Enum
import html
import logging
import math
from pathlib import Path
@@ -13,6 +15,28 @@ from typing import Iterable
from pptx import Presentation
class NotesFormat(str, Enum):
    """Output formats that can be requested when extracting slide notes.

    Members are also ``str`` instances, so they compare equal to their raw values.
    """

    # Plain text only; no HTML is rendered for the notes.
    PLAIN = "plain"
    # Sanitized HTML is rendered alongside the plain text.
    HTML = "html"
@dataclass(frozen=True, slots=True)
class HtmlFormattingPolicy:
    """Switches that suppress individual kinds of run formatting in HTML notes.

    Every flag defaults to ``False``, i.e. the corresponding formatting is kept.
    """

    # Do not wrap bold runs in <strong>.
    ignore_bold: bool = False
    # Do not wrap italic runs in <em>.
    ignore_italic: bool = False
    # Do not wrap underlined runs in <u>.
    ignore_underline: bool = False
    # Do not wrap struck-through runs in <s>.
    ignore_strikethrough: bool = False
    # Do not emit a "font-size" declaration on the run's <span>.
    ignore_font_size: bool = False
    # Do not emit a "color" declaration on the run's <span>.
    ignore_color: bool = False
@dataclass(frozen=True, slots=True)
class NotesOptions:
    """Options controlling how slide notes are extracted."""

    # Requested output format; HTML is only rendered when this is NotesFormat.HTML.
    format: NotesFormat = NotesFormat.PLAIN
    # True: each paragraph is wrapped in <p>...</p>.
    # False: paragraphs are joined with "<br/><br/>" instead.
    html_use_paragraph_tags: bool = True
    # Which run-level formatting to drop from the HTML output. The default instance is
    # shared between NotesOptions objects, which is safe because the policy is frozen.
    html_policy: HtmlFormattingPolicy = HtmlFormattingPolicy()
@dataclass(frozen=True, slots=True)
class SlideArtifact:
"""Represents one converted slide image and its extracted notes."""
@@ -21,6 +45,7 @@ class SlideArtifact:
image_path: Path
thumbnail_path: Path
notes_plain: str
notes_html: str = ""
@dataclass(frozen=True, slots=True)
@@ -290,14 +315,52 @@ def extract_slide_notes(pptx_path: Path) -> list[str]:
if not pptx_path.exists():
raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")
plain, _html = _extract_slide_notes(pptx_path, options=NotesOptions(format=NotesFormat.PLAIN))
return plain
def extract_slide_notes_html(
    pptx_path: Path,
    *,
    options: NotesOptions | None = None,
) -> list[str]:
    """Return one sanitized HTML notes string per slide, in slide index order.

    The markup is safe by construction rather than by after-the-fact filtering:
    - run text is always HTML-escaped before any tag is added
    - the only tags ever emitted are p, br, strong, em, u, s and span
    - the only attribute ever emitted is a ``style`` attribute on span, and its
      value is generated by this module

    Args:
        pptx_path: Location of the source presentation.
        options: Extraction options. When omitted, default options with
            ``format=NotesFormat.HTML`` are used.

    Returns:
        A list with one entry per slide; slides without notes yield ``""``.

    Raises:
        FileNotFoundError: ``pptx_path`` does not exist.
        ValueError: ``options.format`` is not ``NotesFormat.HTML``.
    """
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

    effective = options or NotesOptions(format=NotesFormat.HTML)
    if effective.format != NotesFormat.HTML:
        raise ValueError("extract_slide_notes_html requires NotesOptions.format=NotesFormat.HTML")

    # The shared extractor returns (plain, html); only the HTML half is wanted here.
    return _extract_slide_notes(pptx_path, options=effective)[1]
def _extract_slide_notes(
    pptx_path: Path,
    *,
    options: NotesOptions,
) -> tuple[list[str], list[str]]:
    """Extract plain-text and, when requested, HTML notes for every slide.

    Args:
        pptx_path: Location of the source presentation. Existence is not checked
            here; the public wrappers do that before calling.
        options: Extraction options. HTML is rendered only when
            ``options.format`` is ``NotesFormat.HTML``.

    Returns:
        ``(plain_notes, html_notes)``: two lists of equal length with one entry
        per slide, in slide order. Slides without notes contribute ``""`` to both
        lists, and every HTML entry is ``""`` when HTML was not requested.
    """
    presentation = Presentation(str(pptx_path.resolve()))
    want_html = options.format == NotesFormat.HTML

    plain_notes: list[str] = []
    html_notes: list[str] = []
    for slide in presentation.slides:
        # Test has_notes_slide before touching notes_slide so that a slide without
        # notes is recorded as empty rather than having its notes slide accessed.
        if not slide.has_notes_slide:
            # Append to both lists so they stay index-aligned with the slides.
            plain_notes.append("")
            html_notes.append("")
            continue

        shapes = slide.notes_slide.shapes
        plain_notes.append(_extract_notes_text(shapes))
        html_notes.append(_extract_notes_html(shapes, options=options) if want_html else "")
    return plain_notes, html_notes
def convert_pptx_to_slidedeck(
@@ -314,6 +377,7 @@ def convert_pptx_to_slidedeck(
pptx_to_pdf_per_slide_timeout_s: int = 3,
pdf_to_images_base_timeout_s: int = 30,
pdf_to_images_per_slide_timeout_s: int = 8,
notes_options: NotesOptions | None = None,
progress_callback: ProgressCallback | None = None,
) -> SlideDeckResult:
"""Convert a PPTX into rendered images and extracted notes.
@@ -344,8 +408,9 @@ def convert_pptx_to_slidedeck(
full_image_dir = work_dir / "slides_full"
thumbnail_image_dir = work_dir / "slides_thumb"
resolved_notes_options = notes_options or NotesOptions(format=NotesFormat.PLAIN)
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 0, 1)
notes = extract_slide_notes(pptx_path)
notes_plain, notes_html = _extract_slide_notes(pptx_path, options=resolved_notes_options)
_emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 1, 1)
slide_width, slide_height = _read_slide_size_emu(pptx_path)
output_width, output_height = _infer_output_dimensions_from_slide_size(
@@ -370,7 +435,7 @@ def convert_pptx_to_slidedeck(
output_width_px=thumbnail_width,
output_height_px=thumbnail_height,
)
slide_count = len(notes)
slide_count = len(notes_plain)
pptx_to_pdf_timeout = _compute_adaptive_timeout(
slide_count=slide_count,
timeout_cap_s=pptx_to_pdf_timeout_s,
@@ -454,15 +519,15 @@ def convert_pptx_to_slidedeck(
),
)
if len(full_image_paths) != len(notes):
if len(full_image_paths) != len(notes_plain):
raise ValueError(
"rendered full-size slide count does not match note count: "
f"{len(full_image_paths)} image(s) vs {len(notes)} note entries"
f"{len(full_image_paths)} image(s) vs {len(notes_plain)} note entries"
)
if len(thumbnail_image_paths) != len(notes):
if len(thumbnail_image_paths) != len(notes_plain):
raise ValueError(
"rendered thumbnail slide count does not match note count: "
f"{len(thumbnail_image_paths)} image(s) vs {len(notes)} note entries"
f"{len(thumbnail_image_paths)} image(s) vs {len(notes_plain)} note entries"
)
slides = [
@@ -470,10 +535,11 @@ def convert_pptx_to_slidedeck(
index=index,
image_path=image_path,
thumbnail_path=thumbnail_path,
notes_plain=note,
notes_plain=plain,
notes_html=html,
)
for index, (image_path, thumbnail_path, note) in enumerate(
zip(full_image_paths, thumbnail_image_paths, notes),
for index, (image_path, thumbnail_path, plain, html) in enumerate(
zip(full_image_paths, thumbnail_image_paths, notes_plain, notes_html),
start=1,
)
]
@@ -632,3 +698,102 @@ def _extract_notes_text(shapes: Iterable[object]) -> str:
if text:
segments.append(text)
return "\n\n".join(segments).strip()
def _extract_notes_html(shapes: Iterable[object], *, options: NotesOptions) -> str:
    """Render the text of all note shapes as a single sanitized HTML string.

    Shapes without a ``text_frame`` are skipped, as are paragraphs that render to
    an empty string. With ``options.html_use_paragraph_tags`` the rendered
    paragraphs are concatenated as-is; otherwise they are joined with a double
    ``<br/>`` so paragraph boundaries remain visible.

    Returns:
        The combined HTML, or ``""`` when no paragraph produced any output.
    """
    fragments: list[str] = []
    for shape in shapes:
        frame = getattr(shape, "text_frame", None)
        if frame is None:
            continue
        rendered = (
            _paragraph_to_html(para, options=options)
            for para in getattr(frame, "paragraphs", []) or []
        )
        fragments.extend(piece for piece in rendered if piece)

    if not fragments:
        return ""
    if options.html_use_paragraph_tags:
        return "".join(fragments)

    def _unwrapped(fragment: str) -> str:
        # Only fragments that open with <p> are unwrapped; anything else is left alone.
        if fragment.startswith("<p>"):
            return fragment.removeprefix("<p>").removesuffix("</p>")
        return fragment

    # Flatten paragraph boundaries into <br/> separators (double-break between paragraphs).
    return "<br/><br/>".join(map(_unwrapped, fragments))
def _run_inline_style(font: object, policy: HtmlFormattingPolicy) -> str:
    """Build the CSS declarations (colour, font size) for one run's ``<span>``."""
    declarations: list[str] = []

    if not policy.ignore_color:
        rgb_obj = getattr(getattr(font, "color", None), "rgb", None)
        if rgb_obj is not None:
            rgb = str(rgb_obj)
            # The value is placed inside a style="..." attribute, so accept nothing
            # but exactly six hex digits; a length check alone would let quotes or
            # angle brackets through if the upstream object stringified unexpectedly.
            if len(rgb) == 6 and all(ch in "0123456789abcdefABCDEF" for ch in rgb):
                declarations.append(f"color: #{rgb};")

    if not policy.ignore_font_size:
        pt = getattr(getattr(font, "size", None), "pt", None)
        # Clamp to a sane range to avoid pathological CSS.
        if isinstance(pt, (int, float)) and 0.5 <= pt <= 512:
            pt_str = str(int(pt)) if float(pt).is_integer() else str(round(float(pt), 2))
            declarations.append(f"font-size: {pt_str}pt;")

    return "".join(declarations)


def _paragraph_to_html(paragraph: object, *, options: NotesOptions) -> str:
    """Render one notes paragraph as sanitized HTML.

    Each non-empty run is HTML-escaped first, then wrapped, innermost to outermost,
    in ``<span style=...>``, ``<strong>``, ``<em>``, ``<u>`` and ``<s>`` according to
    the run's font and ``options.html_policy``. Newlines in run text become ``<br/>``.

    NOTE(review): only ``paragraph.runs`` is visited. Content the library does not
    expose as a run (for example explicit line breaks or fields, if they are modelled
    separately) would not be rendered; confirm against python-pptx.

    Args:
        paragraph: Object exposing ``runs``; each run exposes ``text`` and
            optionally ``font``.
        options: Supplies the formatting policy and whether to wrap in ``<p>``.

    Returns:
        ``<p>...</p>`` when ``options.html_use_paragraph_tags`` is set, otherwise the
        bare inner markup; ``""`` when the paragraph has no visible content.
    """
    policy = options.html_policy
    parts: list[str] = []
    for run in getattr(paragraph, "runs", []) or []:
        text = getattr(run, "text", "") or ""
        if not text:
            continue

        # Escape first, then re-introduce only allowlisted tags.
        content = html.escape(text, quote=False).replace("\n", "<br/>")

        font = getattr(run, "font", None)
        if font is not None:
            style = _run_inline_style(font, policy)
            if style:
                content = f'<span style="{style}">{content}</span>'
            # Only an explicit True counts; None means the value is not set on the run.
            if not policy.ignore_bold and getattr(font, "bold", None) is True:
                content = f"<strong>{content}</strong>"
            if not policy.ignore_italic and getattr(font, "italic", None) is True:
                content = f"<em>{content}</em>"
            if not policy.ignore_underline and _truthy_underline(getattr(font, "underline", None)):
                content = f"<u>{content}</u>"
            if not policy.ignore_strikethrough and _truthy_strikethrough(run):
                content = f"<s>{content}</s>"

        parts.append(content)

    inner = "".join(parts).strip()
    if not inner:
        return ""
    if options.html_use_paragraph_tags:
        return f"<p>{inner}</p>"
    return inner
def _truthy_underline(value: object) -> bool:
    """Report whether a font's ``underline`` value means the text is underlined.

    python-pptx may represent underline as True/False/None or an enum value.
    ``None`` and anything equal to ``False`` count as "not underlined"; every
    other value (``True`` or a specific underline style) counts as underlined.
    """
    return value is True or value not in (None, False)
def _truthy_strikethrough(run: object) -> bool:
    """Report whether a run is struck through, based on its raw run properties.

    python-pptx doesn't currently expose a first-class Font.strike property, so the
    ``strike`` attribute is read from the run's underlying ``rPr`` XML element.
    A missing element, a missing attribute, or one of the explicit "off" values
    means no strikethrough; any other value means the run is struck through.
    """
    run_props = getattr(getattr(run, "_r", None), "rPr", None)
    if run_props is None:
        return False
    return run_props.get("strike") not in (None, "noStrike", "false", "0")
@@ -0,0 +1,111 @@
import tempfile
import unittest
from pathlib import Path
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.util import Pt
from officeconvert.conversion import HtmlFormattingPolicy, NotesFormat, NotesOptions, extract_slide_notes_html
def _build_pptx_with_formatted_notes(tmp_path: Path) -> Path:
    """Write a one-slide PPTX with formatted speaker notes and return its path.

    The notes hold two paragraphs:
    1. a bold, red, 24pt run followed by an underlined, struck-through run;
    2. an italic run whose text is a literal ``<script>`` tag, used to verify escaping.

    Args:
        tmp_path: Existing directory in which ``notes.pptx`` is created.
    """
    prs = Presentation()
    # Index 5 is the "Title Only" layout in python-pptx's default template (not the
    # blank one); the layout does not matter here because only the notes are used.
    slide = prs.slides.add_slide(prs.slide_layouts[5])
    notes = slide.notes_slide
    tf = notes.notes_text_frame

    # First paragraph: mixed runs with formatting.
    p1 = tf.paragraphs[0]
    p1.text = ""
    r1 = p1.add_run()
    r1.text = "BoldRed24 "
    r1.font.bold = True
    r1.font.color.rgb = RGBColor(0xFF, 0x00, 0x00)
    r1.font.size = Pt(24)

    r2 = p1.add_run()
    r2.text = "UnderStrike"
    r2.font.underline = True
    # python-pptx does not currently expose a first-class API for strikethrough,
    # but the underlying DrawingML uses a:rPr@strike="sngStrike" or "dblStrike".
    # get_or_add_rPr() guarantees the element exists instead of relying on the
    # earlier font access having created it as a side effect.
    r2._r.get_or_add_rPr().set("strike", "sngStrike")

    # Second paragraph: ensure escaping is applied.
    p2 = tf.add_paragraph()
    r3 = p2.add_run()
    r3.text = "<script>alert(1)</script>"
    r3.font.italic = True

    out = tmp_path / "notes.pptx"
    prs.save(out)
    return out
class NotesHtmlExtractionTests(unittest.TestCase):
    """End-to-end checks of HTML notes extraction against a generated PPTX."""

    @staticmethod
    def _extract(options: NotesOptions) -> list[str]:
        """Build the fixture deck in a temporary directory and extract its HTML notes."""
        with tempfile.TemporaryDirectory() as workdir:
            deck_path = _build_pptx_with_formatted_notes(Path(workdir))
            return extract_slide_notes_html(deck_path, options=options)

    def test_extracts_basic_rich_text_and_escapes(self) -> None:
        per_slide = self._extract(
            NotesOptions(format=NotesFormat.HTML, html_use_paragraph_tags=True)
        )
        self.assertEqual(len(per_slide), 1)
        markup = per_slide[0]

        # Paragraph wrappers first, then one tag per kind of run formatting.
        for expected_tag in ("<p>", "</p>", "<strong>", "<em>", "<u>", "<s>"):
            self.assertIn(expected_tag, markup)

        # Colour and font size share a single style attribute on the span.
        self.assertIn('style="color: #FF0000;font-size: 24pt;"', markup)

        # The literal script tag from the notes must only appear in escaped form.
        self.assertNotIn("<script>", markup)
        self.assertIn("&lt;script&gt;alert(1)&lt;/script&gt;", markup)

    def test_ignore_policy_prunes_styles_and_tags(self) -> None:
        suppress = HtmlFormattingPolicy(
            ignore_bold=True,
            ignore_color=True,
            ignore_font_size=True,
        )
        markup = self._extract(
            NotesOptions(
                format=NotesFormat.HTML,
                html_use_paragraph_tags=True,
                html_policy=suppress,
            )
        )[0]

        for suppressed in ("<strong>", "color:", "font-size:"):
            self.assertNotIn(suppressed, markup)

    def test_br_mode_uses_double_break_between_paragraphs(self) -> None:
        markup = self._extract(
            NotesOptions(format=NotesFormat.HTML, html_use_paragraph_tags=False)
        )[0]

        self.assertNotIn("<p>", markup)
        self.assertIn("<br/><br/>", markup)
if __name__ == "__main__":
unittest.main()
@@ -23,6 +23,7 @@ class ConversionSession:
thumbnail_resolution: conversion_pb2.ConversionResolution
full_jpeg_quality: int
thumbnail_jpeg_quality: int
notes: conversion_pb2.NotesOptions | None = None
bucket_name: str
upload_object_key: str
status: conversion_pb2.ConversionStatus
@@ -18,6 +18,9 @@ from google.protobuf.timestamp_pb2 import Timestamp
from officeconvert import SlideArtifact, convert_pptx_to_slidedeck
from officeconvert.conversion import (
ConversionTimeoutError,
HtmlFormattingPolicy,
NotesFormat,
NotesOptions,
PHASE_EXTRACTING_NOTES,
PHASE_PDF_TO_IMAGES,
PHASE_PPTX_TO_PDF,
@@ -49,6 +52,37 @@ _DEFAULT_FULL_RESOLUTION = conversion_pb2.CONVERSION_RESOLUTION_FHD
_DEFAULT_THUMBNAIL_RESOLUTION = conversion_pb2.CONVERSION_RESOLUTION_SD
_DEFAULT_FULL_JPEG_QUALITY = 85
_DEFAULT_THUMBNAIL_JPEG_QUALITY = 75
_DEFAULT_NOTES_FORMAT = conversion_pb2.NOTES_FORMAT_PLAIN
_DEFAULT_HTML_USE_PARAGRAPH_TAGS = True
def _to_library_notes_options(
    notes: conversion_pb2.NotesOptions | None,
) -> NotesOptions | None:
    """Translate the request's protobuf notes options into the library's ``NotesOptions``.

    Args:
        notes: The protobuf message, or ``None`` when the request carried none.

    Returns:
        ``None`` when ``notes`` is ``None``; otherwise an equivalent library object.
        A falsy ``format`` falls back to the default format, and any format other
        than ``NOTES_FORMAT_HTML`` maps to plain. ``html_use_paragraph_tags`` uses
        the default unless the field is explicitly present on the message.
    """
    if notes is None:
        return None

    requested_format = notes.format or _DEFAULT_NOTES_FORMAT
    wants_html = requested_format == conversion_pb2.NOTES_FORMAT_HTML

    # Distinguish "explicitly false" from "not sent" via field presence.
    use_paragraph_tags = (
        bool(notes.html_use_paragraph_tags)
        if notes.HasField("html_use_paragraph_tags")
        else _DEFAULT_HTML_USE_PARAGRAPH_TAGS
    )

    proto_policy = notes.html_policy
    return NotesOptions(
        format=NotesFormat.HTML if wants_html else NotesFormat.PLAIN,
        html_use_paragraph_tags=use_paragraph_tags,
        html_policy=HtmlFormattingPolicy(
            ignore_bold=bool(proto_policy.ignore_bold),
            ignore_italic=bool(proto_policy.ignore_italic),
            ignore_underline=bool(proto_policy.ignore_underline),
            ignore_strikethrough=bool(proto_policy.ignore_strikethrough),
            ignore_font_size=bool(proto_policy.ignore_font_size),
            ignore_color=bool(proto_policy.ignore_color),
        ),
    )
class ConversionServiceImpl(conversion_connect.ConversionService):
@@ -124,6 +158,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
thumbnail_resolution=thumbnail_resolution,
full_jpeg_quality=full_jpeg_quality,
thumbnail_jpeg_quality=thumbnail_jpeg_quality,
notes=request.notes if request.HasField("notes") else None,
bucket_name=bucket_name,
upload_object_key=upload_key,
status=conversion_pb2.CONVERSION_STATUS_PENDING,
@@ -280,6 +315,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
pptx_to_pdf_per_slide_timeout_s=self._config.conversion_pptx_to_pdf_per_slide_timeout_seconds,
pdf_to_images_base_timeout_s=self._config.conversion_pdf_to_images_base_timeout_seconds,
pdf_to_images_per_slide_timeout_s=self._config.conversion_pdf_to_images_per_slide_timeout_seconds,
notes_options=_to_library_notes_options(session.notes),
progress_callback=lambda phase_name, current, max_value: self._set_session_progress_from_name(
session,
phase_name=phase_name,
@@ -428,6 +464,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
conversion_pb2.Slide(
index=slide.index,
notes_plain=slide.notes_plain,
notes_html=slide.notes_html,
image_url=image_url,
thumbnail_image_url=thumbnail_image_url,
)