add rich output support for slide notes

2026-05-07 10:35:37 -07:00
parent 500b767d58
commit 06d4122e4e
11 changed files with 831 additions and 140 deletions
@@ -1,19 +1,27 @@
 """Public conversion APIs for the officeconvert Python library."""

 from officeconvert.conversion import (
+    HtmlFormattingPolicy,
+    NotesFormat,
+    NotesOptions,
    SlideArtifact,
    SlideDeckResult,
    convert_pptx_to_pdf,
    convert_pptx_to_slidedeck,
    extract_slide_notes,
+    extract_slide_notes_html,
    render_pdf_to_images,
 )

 __all__ = [
+    "HtmlFormattingPolicy",
+    "NotesFormat",
+    "NotesOptions",
    "SlideArtifact",
    "SlideDeckResult",
    "convert_pptx_to_pdf",
    "convert_pptx_to_slidedeck",
    "extract_slide_notes",
+    "extract_slide_notes_html",
    "render_pdf_to_images",
 ]
@@ -4,6 +4,8 @@ from __future__ import annotations

 from collections.abc import Callable
 from dataclasses import dataclass
+from enum import Enum
+import html
 import logging
 import math
 from pathlib import Path
@@ -13,6 +15,28 @@ from typing import Iterable
 from pptx import Presentation


+class NotesFormat(str, Enum):
+    """Output formats that slide-notes extraction can be asked to produce."""
+
+    # Plain-text notes only; no HTML is generated.
+    PLAIN = "plain"
+    # Additionally generate a sanitized HTML fragment for each slide's notes.
+    HTML = "html"
+
+
+@dataclass(frozen=True, slots=True)
+class HtmlFormattingPolicy:
+    """Selects which run-level formatting is left out of generated notes HTML.
+
+    Every flag defaults to ``False``, meaning the corresponding formatting is
+    emitted whenever the source run carries it.
+    """
+
+    # Do not wrap bold runs in <strong>.
+    ignore_bold: bool = False
+    # Do not wrap italic runs in <em>.
+    ignore_italic: bool = False
+    # Do not wrap underlined runs in <u>.
+    ignore_underline: bool = False
+    # Do not wrap struck-through runs in <s>.
+    ignore_strikethrough: bool = False
+    # Do not emit a "font-size" declaration on the run's <span>.
+    ignore_font_size: bool = False
+    # Do not emit a "color" declaration on the run's <span>.
+    ignore_color: bool = False
+
+
+@dataclass(frozen=True, slots=True)
+class NotesOptions:
+    """Options controlling how slide notes are extracted.
+
+    HTML notes are only generated when ``format`` is ``NotesFormat.HTML``; the
+    ``html_*`` fields shape that HTML output.
+    """
+
+    # Requested notes output; plain text by default.
+    format: NotesFormat = NotesFormat.PLAIN
+    # True: wrap each paragraph in <p>...</p>.
+    # False: emit bare paragraphs separated by "<br/><br/>".
+    html_use_paragraph_tags: bool = True
+    # Formatting to drop from the HTML. The default instance is shared by all
+    # NotesOptions objects, which is safe because the policy is frozen.
+    html_policy: HtmlFormattingPolicy = HtmlFormattingPolicy()
+
+
@dataclass(frozen=True, slots=True)
 class SlideArtifact:
    """Represents one converted slide image and its extracted notes."""
@@ -21,6 +45,7 @@ class SlideArtifact:
    image_path: Path
    thumbnail_path: Path
    notes_plain: str
+    notes_html: str = ""


@dataclass(frozen=True, slots=True)
@@ -290,14 +315,52 @@ def extract_slide_notes(pptx_path: Path) -> list[str]:
    if not pptx_path.exists():
        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")

+    plain, _html = _extract_slide_notes(pptx_path, options=NotesOptions(format=NotesFormat.PLAIN))
+    return plain
+
+
+def extract_slide_notes_html(
+    pptx_path: Path,
+    *,
+    options: NotesOptions | None = None,
+) -> list[str]:
+    """Return one sanitized HTML notes fragment per slide, in slide order.
+
+    The markup is safe by construction:
+    - run text is always HTML-escaped before any tag is added
+    - only an allowlist of tags is emitted: p, br, strong, em, u, s, span
+    - the only attribute emitted is a `style` attribute built by this module,
+      and only on span tags
+
+    Args:
+        pptx_path: Location of the source presentation.
+        options: Extraction options; when omitted, HTML output with default
+            settings is used.
+
+    Raises:
+        FileNotFoundError: ``pptx_path`` does not exist.
+        ValueError: ``options.format`` is not ``NotesFormat.HTML``.
+    """
+    if not pptx_path.exists():
+        raise FileNotFoundError(f"source PPTX does not exist: {pptx_path}")
+
+    effective_options = options if options else NotesOptions(format=NotesFormat.HTML)
+    if effective_options.format != NotesFormat.HTML:
+        raise ValueError("extract_slide_notes_html requires NotesOptions.format=NotesFormat.HTML")
+
+    # The shared extractor returns (plain, html); only the HTML list is wanted.
+    return _extract_slide_notes(pptx_path, options=effective_options)[1]
+
+
+def _extract_slide_notes(
+    pptx_path: Path,
+    *,
+    options: NotesOptions,
+) -> tuple[list[str], list[str]]:
+    """Load the presentation and collect the notes of every slide in slide order.
+
+    Returns a ``(plain_notes, html_notes)`` pair of equal-length lists holding
+    one entry per slide. A slide without a notes slide contributes ``""`` to
+    both lists. HTML entries are only rendered when ``options.format`` is
+    ``NotesFormat.HTML``; otherwise every HTML entry is ``""``.
+    """
    presentation = Presentation(str(pptx_path.resolve()))
-    notes: list[str] = []
+    want_html = options.format == NotesFormat.HTML
+
+    plain_notes: list[str] = []
+    html_notes: list[str] = []
    for slide in presentation.slides:
        if not slide.has_notes_slide:
-            notes.append("")
+            plain_notes.append("")
+            html_notes.append("")
            continue
-        notes.append(_extract_notes_text(slide.notes_slide.shapes))
-    return notes
+        plain = _extract_notes_text(slide.notes_slide.shapes)
+        plain_notes.append(plain)
+        html_notes.append(_extract_notes_html(slide.notes_slide.shapes, options=options) if want_html else "")
+
+    return plain_notes, html_notes


 def convert_pptx_to_slidedeck(
@@ -314,6 +377,7 @@ def convert_pptx_to_slidedeck(
    pptx_to_pdf_per_slide_timeout_s: int = 3,
    pdf_to_images_base_timeout_s: int = 30,
    pdf_to_images_per_slide_timeout_s: int = 8,
+    notes_options: NotesOptions | None = None,
    progress_callback: ProgressCallback | None = None,
 ) -> SlideDeckResult:
    """Convert a PPTX into rendered images and extracted notes.
@@ -344,8 +408,9 @@ def convert_pptx_to_slidedeck(
    full_image_dir = work_dir / "slides_full"
    thumbnail_image_dir = work_dir / "slides_thumb"

+    resolved_notes_options = notes_options or NotesOptions(format=NotesFormat.PLAIN)
    _emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 0, 1)
-    notes = extract_slide_notes(pptx_path)
+    notes_plain, notes_html = _extract_slide_notes(pptx_path, options=resolved_notes_options)
    _emit_progress(progress_callback, PHASE_EXTRACTING_NOTES, 1, 1)
    slide_width, slide_height = _read_slide_size_emu(pptx_path)
    output_width, output_height = _infer_output_dimensions_from_slide_size(
@@ -370,7 +435,7 @@ def convert_pptx_to_slidedeck(
        output_width_px=thumbnail_width,
        output_height_px=thumbnail_height,
    )
-    slide_count = len(notes)
+    slide_count = len(notes_plain)
    pptx_to_pdf_timeout = _compute_adaptive_timeout(
        slide_count=slide_count,
        timeout_cap_s=pptx_to_pdf_timeout_s,
@@ -454,15 +519,15 @@ def convert_pptx_to_slidedeck(
        ),
    )

-    if len(full_image_paths) != len(notes):
+    if len(full_image_paths) != len(notes_plain):
        raise ValueError(
            "rendered full-size slide count does not match note count: "
-            f"{len(full_image_paths)} image(s) vs {len(notes)} note entries"
+            f"{len(full_image_paths)} image(s) vs {len(notes_plain)} note entries"
        )
-    if len(thumbnail_image_paths) != len(notes):
+    if len(thumbnail_image_paths) != len(notes_plain):
        raise ValueError(
            "rendered thumbnail slide count does not match note count: "
-            f"{len(thumbnail_image_paths)} image(s) vs {len(notes)} note entries"
+            f"{len(thumbnail_image_paths)} image(s) vs {len(notes_plain)} note entries"
        )

    slides = [
@@ -470,10 +535,11 @@ def convert_pptx_to_slidedeck(
            index=index,
            image_path=image_path,
            thumbnail_path=thumbnail_path,
-            notes_plain=note,
+            notes_plain=plain,
+            notes_html=html,
        )
-        for index, (image_path, thumbnail_path, note) in enumerate(
-            zip(full_image_paths, thumbnail_image_paths, notes),
+        for index, (image_path, thumbnail_path, plain, html) in enumerate(
+            zip(full_image_paths, thumbnail_image_paths, notes_plain, notes_html),
            start=1,
        )
    ]
@@ -632,3 +698,102 @@ def _extract_notes_text(shapes: Iterable[object]) -> str:
        if text:
            segments.append(text)
    return "\n\n".join(segments).strip()
+
+
+def _extract_notes_html(shapes: Iterable[object], *, options: NotesOptions) -> str:
+    """Render the text of all note shapes as one sanitized HTML fragment.
+
+    Shapes without a ``text_frame`` are skipped, as are paragraphs that render
+    to an empty string. With ``options.html_use_paragraph_tags`` the rendered
+    paragraphs are concatenated as-is; otherwise they are joined with
+    ``<br/><br/>`` after removing any ``<p>`` wrapper.
+    """
+
+    def _unwrap(fragment: str) -> str:
+        # Only fragments that actually open with <p> lose their wrapper.
+        if not fragment.startswith("<p>"):
+            return fragment
+        return fragment.removeprefix("<p>").removesuffix("</p>")
+
+    rendered: list[str] = []
+    for shape in shapes:
+        frame = getattr(shape, "text_frame", None)
+        if frame is None:
+            continue
+        candidates = (
+            _paragraph_to_html(para, options=options)
+            for para in getattr(frame, "paragraphs", []) or []
+        )
+        rendered.extend(fragment for fragment in candidates if fragment)
+
+    if not rendered:
+        return ""
+    if options.html_use_paragraph_tags:
+        return "".join(rendered)
+    # Flatten paragraph boundaries into <br/> separators (double-break between paragraphs).
+    return "<br/><br/>".join(map(_unwrap, rendered))
+
+
+def _paragraph_to_html(paragraph: object, *, options: NotesOptions) -> str:
+    """Render one notes paragraph as a sanitized HTML fragment.
+
+    Each non-empty run is HTML-escaped, its newlines become ``<br/>``, and it is
+    then wrapped in an optional ``<span style="...">`` (color, font size)
+    followed by ``<strong>``, ``<em>``, ``<u>`` and ``<s>`` as permitted by
+    ``options.html_policy``.
+
+    Args:
+        paragraph: Object exposing ``runs``; each run exposes ``text`` and,
+            optionally, ``font``. Missing attributes are treated as absent.
+        options: Supplies ``html_policy`` and ``html_use_paragraph_tags``.
+
+    Returns:
+        ``<p>...</p>`` when ``options.html_use_paragraph_tags`` is set, the bare
+        inline markup otherwise, or ``""`` when the paragraph has no run text
+        left after stripping surrounding whitespace.
+    """
+    policy = options.html_policy
+    hex_digits = "0123456789abcdefABCDEF"
+
+    parts: list[str] = []
+    for run in getattr(paragraph, "runs", []) or []:
+        text = getattr(run, "text", "") or ""
+        if not text:
+            continue
+        # Escape first, then re-introduce only allowlisted tags.
+        content = html.escape(text, quote=False).replace("\n", "<br/>")
+
+        font = getattr(run, "font", None)
+        if font is None:
+            parts.append(content)
+            continue
+
+        declarations: list[str] = []
+        if not policy.ignore_color:
+            rgb_obj = getattr(getattr(font, "color", None), "rgb", None)
+            if rgb_obj is not None:
+                rgb = str(rgb_obj)
+                # The value lands inside a double-quoted attribute, so accept
+                # nothing but exactly six hex digits; a length check alone
+                # would let quotes or angle brackets through.
+                if len(rgb) == 6 and all(ch in hex_digits for ch in rgb):
+                    declarations.append(f"color: #{rgb};")
+        if not policy.ignore_font_size:
+            pt = getattr(getattr(font, "size", None), "pt", None)
+            # Clamp to a sane range to avoid pathological CSS.
+            if isinstance(pt, (int, float)) and 0.5 <= pt <= 512:
+                pt_str = str(int(pt)) if float(pt).is_integer() else str(round(float(pt), 2))
+                declarations.append(f"font-size: {pt_str}pt;")
+
+        if declarations:
+            style = "".join(declarations)
+            content = f'<span style="{style}">{content}</span>'
+
+        # Tags are applied innermost-first: strong, em, u, s.
+        if not policy.ignore_bold and getattr(font, "bold", None) is True:
+            content = f"<strong>{content}</strong>"
+        if not policy.ignore_italic and getattr(font, "italic", None) is True:
+            content = f"<em>{content}</em>"
+        if not policy.ignore_underline and _truthy_underline(getattr(font, "underline", None)):
+            content = f"<u>{content}</u>"
+        if not policy.ignore_strikethrough and _truthy_strikethrough(run):
+            content = f"<s>{content}</s>"
+
+        parts.append(content)
+
+    inner = "".join(parts).strip()
+    if not inner:
+        return ""
+    if options.html_use_paragraph_tags:
+        return f"<p>{inner}</p>"
+    return inner
+
+
+def _truthy_underline(value: object) -> bool:
+    """Tell whether a font's ``underline`` value means the text is underlined.
+
+    python-pptx may represent underline as True/False/None or an enum value.
+    ``True`` counts as underlined; anything equal to ``None`` or ``False`` does
+    not; every other value is taken to name an underline style.
+    """
+    return value is True or value not in (None, False)
+
+
+def _truthy_strikethrough(run: object) -> bool:
+    """Tell whether a run is struck through, based on its ``rPr`` ``strike`` attribute.
+
+    Returns ``False`` when the run has no ``_r`` element, no ``rPr`` child, no
+    ``strike`` attribute, or a ``strike`` value of "noStrike", "false" or "0".
+    Any other value counts as struck through.
+    """
+    # python-pptx doesn't currently expose a first-class Font.strike property; detect via XML.
+    run_properties = getattr(getattr(run, "_r", None), "rPr", None)
+    if run_properties is None:
+        return False
+    return run_properties.get("strike") not in (None, "noStrike", "false", "0")
@@ -0,0 +1,111 @@
+import tempfile
+import unittest
+from pathlib import Path
+
+from pptx import Presentation
+from pptx.dml.color import RGBColor
+from pptx.util import Pt
+
+from officeconvert.conversion import HtmlFormattingPolicy, NotesFormat, NotesOptions, extract_slide_notes_html
+
+
+def _build_pptx_with_formatted_notes(tmp_path: Path) -> Path:
+    """Write a one-slide deck with formatted speaker notes and return its path.
+
+    The notes hold two paragraphs: one with a bold/red/24pt run followed by an
+    underlined, struck-through run, and one with an italic run whose text is a
+    ``<script>`` tag (used to check escaping). The file is saved as
+    ``notes.pptx`` inside ``tmp_path``.
+    """
+    prs = Presentation()
+    # NOTE(review): index 5 looks like "Title Only" in python-pptx's default
+    # template (index 6 is "Blank") - confirm. The layout should not matter
+    # here because the tests only inspect the notes slide.
+    slide = prs.slides.add_slide(prs.slide_layouts[5])
+
+    notes = slide.notes_slide
+    tf = notes.notes_text_frame
+
+    # First paragraph: mixed runs with formatting.
+    p1 = tf.paragraphs[0]
+    p1.text = ""
+    r1 = p1.add_run()
+    r1.text = "BoldRed24 "
+    r1.font.bold = True
+    r1.font.color.rgb = RGBColor(0xFF, 0x00, 0x00)
+    r1.font.size = Pt(24)
+
+    r2 = p1.add_run()
+    r2.text = "UnderStrike"
+    r2.font.underline = True
+    # python-pptx does not currently expose a first-class API for strikethrough,
+    # but the underlying DrawingML uses a:rPr@strike="sngStrike" or "dblStrike".
+    r2._r.rPr.set("strike", "sngStrike")
+
+    # Second paragraph: ensure escaping is applied.
+    p2 = tf.add_paragraph()
+    r3 = p2.add_run()
+    r3.text = "<script>alert(1)</script>"
+    r3.font.italic = True
+
+    out = tmp_path / "notes.pptx"
+    prs.save(out)
+    return out
+
+
+class NotesHtmlExtractionTests(unittest.TestCase):
+    """Checks ``extract_slide_notes_html`` against a generated one-slide deck."""
+
+    def test_extracts_basic_rich_text_and_escapes(self) -> None:
+        """Default policy emits all formatting tags and escapes run text."""
+        with tempfile.TemporaryDirectory() as d:
+            pptx_path = _build_pptx_with_formatted_notes(Path(d))
+            html_notes = extract_slide_notes_html(
+                pptx_path,
+                options=NotesOptions(format=NotesFormat.HTML, html_use_paragraph_tags=True),
+            )
+
+        self.assertEqual(len(html_notes), 1)
+        html_out = html_notes[0]
+
+        # Paragraphs are wrapped in <p>.
+        self.assertIn("<p>", html_out)
+        self.assertIn("</p>", html_out)
+
+        # Formatting tags.
+        self.assertIn("<strong>", html_out)
+        self.assertIn("<em>", html_out)
+        self.assertIn("<u>", html_out)
+        self.assertIn("<s>", html_out)
+
+        # Style tags for RGB and font size.
+        self.assertIn('style="color: #FF0000;font-size: 24pt;"', html_out)
+
+        # Escaping: no raw <script> tag survives.
+        self.assertNotIn("<script>", html_out)
+        self.assertIn("&lt;script&gt;alert(1)&lt;/script&gt;", html_out)
+
+    def test_ignore_policy_prunes_styles_and_tags(self) -> None:
+        """Ignored bold, color and font size leave no trace in the output."""
+        with tempfile.TemporaryDirectory() as d:
+            pptx_path = _build_pptx_with_formatted_notes(Path(d))
+            html_notes = extract_slide_notes_html(
+                pptx_path,
+                options=NotesOptions(
+                    format=NotesFormat.HTML,
+                    html_use_paragraph_tags=True,
+                    html_policy=HtmlFormattingPolicy(
+                        ignore_bold=True,
+                        ignore_color=True,
+                        ignore_font_size=True,
+                    ),
+                ),
+            )
+
+        html_out = html_notes[0]
+        self.assertNotIn("<strong>", html_out)
+        self.assertNotIn("color:", html_out)
+        self.assertNotIn("font-size:", html_out)
+
+    def test_br_mode_uses_double_break_between_paragraphs(self) -> None:
+        """Without paragraph tags, paragraphs are separated by ``<br/><br/>``."""
+        with tempfile.TemporaryDirectory() as d:
+            pptx_path = _build_pptx_with_formatted_notes(Path(d))
+            html_notes = extract_slide_notes_html(
+                pptx_path,
+                options=NotesOptions(format=NotesFormat.HTML, html_use_paragraph_tags=False),
+            )
+
+        html_out = html_notes[0]
+        self.assertNotIn("<p>", html_out)
+        self.assertIn("<br/><br/>", html_out)
+
+
+if __name__ == "__main__":
+    unittest.main()
+
@@ -23,6 +23,7 @@ class ConversionSession:
    thumbnail_resolution: conversion_pb2.ConversionResolution
    full_jpeg_quality: int
    thumbnail_jpeg_quality: int
+    notes: conversion_pb2.NotesOptions | None = None
    bucket_name: str
    upload_object_key: str
    status: conversion_pb2.ConversionStatus
@@ -18,6 +18,9 @@ from google.protobuf.timestamp_pb2 import Timestamp
 from officeconvert import SlideArtifact, convert_pptx_to_slidedeck
 from officeconvert.conversion import (
    ConversionTimeoutError,
+    HtmlFormattingPolicy,
+    NotesFormat,
+    NotesOptions,
    PHASE_EXTRACTING_NOTES,
    PHASE_PDF_TO_IMAGES,
    PHASE_PPTX_TO_PDF,
@@ -49,6 +52,37 @@ _DEFAULT_FULL_RESOLUTION = conversion_pb2.CONVERSION_RESOLUTION_FHD
 _DEFAULT_THUMBNAIL_RESOLUTION = conversion_pb2.CONVERSION_RESOLUTION_SD
 _DEFAULT_FULL_JPEG_QUALITY = 85
 _DEFAULT_THUMBNAIL_JPEG_QUALITY = 75
+_DEFAULT_NOTES_FORMAT = conversion_pb2.NOTES_FORMAT_PLAIN
+_DEFAULT_HTML_USE_PARAGRAPH_TAGS = True
+
+
+def _to_library_notes_options(
+    notes: conversion_pb2.NotesOptions | None,
+) -> NotesOptions | None:
+    """Translate the request's protobuf notes options into library options.
+
+    Returns ``None`` when no options were supplied. An unset (falsy) format
+    falls back to ``_DEFAULT_NOTES_FORMAT``, and any format other than
+    ``NOTES_FORMAT_HTML`` maps to plain text. ``html_use_paragraph_tags`` keeps
+    its default unless the field is explicitly present on the message.
+    """
+    if notes is None:
+        return None
+
+    requested_format = notes.format or _DEFAULT_NOTES_FORMAT
+    if requested_format == conversion_pb2.NOTES_FORMAT_HTML:
+        library_format = NotesFormat.HTML
+    else:
+        library_format = NotesFormat.PLAIN
+
+    use_paragraph_tags = (
+        bool(notes.html_use_paragraph_tags)
+        if notes.HasField("html_use_paragraph_tags")
+        else _DEFAULT_HTML_USE_PARAGRAPH_TAGS
+    )
+
+    proto_policy = notes.html_policy
+    return NotesOptions(
+        format=library_format,
+        html_use_paragraph_tags=use_paragraph_tags,
+        html_policy=HtmlFormattingPolicy(
+            ignore_bold=bool(proto_policy.ignore_bold),
+            ignore_italic=bool(proto_policy.ignore_italic),
+            ignore_underline=bool(proto_policy.ignore_underline),
+            ignore_strikethrough=bool(proto_policy.ignore_strikethrough),
+            ignore_font_size=bool(proto_policy.ignore_font_size),
+            ignore_color=bool(proto_policy.ignore_color),
+        ),
+    )


 class ConversionServiceImpl(conversion_connect.ConversionService):
@@ -124,6 +158,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
            thumbnail_resolution=thumbnail_resolution,
            full_jpeg_quality=full_jpeg_quality,
            thumbnail_jpeg_quality=thumbnail_jpeg_quality,
+            notes=request.notes if request.HasField("notes") else None,
            bucket_name=bucket_name,
            upload_object_key=upload_key,
            status=conversion_pb2.CONVERSION_STATUS_PENDING,
@@ -280,6 +315,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
                pptx_to_pdf_per_slide_timeout_s=self._config.conversion_pptx_to_pdf_per_slide_timeout_seconds,
                pdf_to_images_base_timeout_s=self._config.conversion_pdf_to_images_base_timeout_seconds,
                pdf_to_images_per_slide_timeout_s=self._config.conversion_pdf_to_images_per_slide_timeout_seconds,
+                notes_options=_to_library_notes_options(session.notes),
                progress_callback=lambda phase_name, current, max_value: self._set_session_progress_from_name(
                    session,
                    phase_name=phase_name,
@@ -428,6 +464,7 @@ class ConversionServiceImpl(conversion_connect.ConversionService):
                conversion_pb2.Slide(
                    index=slide.index,
                    notes_plain=slide.notes_plain,
+                    notes_html=slide.notes_html,
                    image_url=image_url,
                    thumbnail_image_url=thumbnail_image_url,
                )