From 3951f7f91dddc64136b949812bf2534759fbd94b Mon Sep 17 00:00:00 2001 From: fujie Date: Sun, 4 Jan 2026 02:24:46 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=BC=BA=20Word=20=E5=AF=BC?= =?UTF-8?q?=E5=87=BA=E6=8F=92=E4=BB=B6=EF=BC=8C=E6=94=AF=E6=8C=81=E5=8E=9F?= =?UTF-8?q?=E7=94=9F=E6=95=B0=E5=AD=A6=E5=85=AC=E5=BC=8F=E3=80=81Mermaid?= =?UTF-8?q?=20=E5=9B=BE=E8=A1=A8=E3=80=81=E5=BC=95=E7=94=A8=E3=80=81?= =?UTF-8?q?=E9=AB=98=E7=BA=A7=E8=A1=A8=E6=A0=BC=E6=A0=BC=E5=BC=8F=E5=8F=8A?= =?UTF-8?q?=E5=89=A5=E7=A6=BB=E6=8E=A8=E7=90=86=E5=9D=97=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/plugins/actions/index.md | 6 +- docs/plugins/actions/index.zh.md | 6 +- plugins/actions/export_to_docx/README.md | 75 +- plugins/actions/export_to_docx/README_CN.md | 80 +- .../actions/export_to_docx/export_to_word.py | 1180 +++++++++++++++-- .../export_to_docx/export_to_word_cn.py | 1172 ++++++++++++++-- 6 files changed, 2238 insertions(+), 281 deletions(-) diff --git a/docs/plugins/actions/index.md b/docs/plugins/actions/index.md index a2df935..4627499 100644 --- a/docs/plugins/actions/index.md +++ b/docs/plugins/actions/index.md @@ -57,13 +57,13 @@ Actions are interactive plugins that: [:octicons-arrow-right-24: Documentation](export-to-excel.md) -- :material-file-word-box:{ .lg .middle } **Export to Word** +- :material-file-word-box:{ .lg .middle } **Export to Word (Enhanced Formatting)** --- - Export chat content as Word (.docx) with Markdown formatting and syntax highlighting. + Export the current conversation to a formatted Word doc with syntax highlighting, AI-generated titles, and perfect Markdown rendering (tables, quotes, lists). 
- **Version:** 0.1.0 + **Version:** 0.1.1 [:octicons-arrow-right-24: Documentation](export-to-word.md) diff --git a/docs/plugins/actions/index.zh.md b/docs/plugins/actions/index.zh.md index 93474dc..5823b01 100644 --- a/docs/plugins/actions/index.zh.md +++ b/docs/plugins/actions/index.zh.md @@ -57,13 +57,13 @@ Actions 是交互式插件,能够: [:octicons-arrow-right-24: 查看文档](export-to-excel.md) -- :material-file-word-box:{ .lg .middle } **Export to Word** +- :material-file-word-box:{ .lg .middle } **Word 导出 (格式增强)** --- - 将聊天内容按 Markdown 格式导出为 Word (.docx),支持语法高亮。 + 将当前对话导出为完美格式的 Word 文档,支持代码语法高亮、AI 智能标题生成以及表格、引用等 Markdown 元素的精准渲染。 - **版本:** 0.1.0 + **版本:** 0.1.1 [:octicons-arrow-right-24: 查看文档](export-to-word.md) diff --git a/plugins/actions/export_to_docx/README.md b/plugins/actions/export_to_docx/README.md index fd78ab4..238c718 100644 --- a/plugins/actions/export_to_docx/README.md +++ b/plugins/actions/export_to_docx/README.md @@ -1,14 +1,19 @@ # Export to Word -Export current conversation from Markdown to Word (.docx) with **syntax highlighting**, **blockquote support**, and smarter filenames. +Export conversation to Word (.docx) with **syntax highlighting**, **native math equations**, **Mermaid diagrams**, **citations**, and **enhanced table formatting**. ## Features - **One-Click Export**: Adds an "Export to Word" action button to the chat. - **Markdown Conversion**: Converts Markdown syntax to Word formatting (headings, bold, italic, code, tables, lists). - **Syntax Highlighting**: Code blocks are highlighted with Pygments (supports 500+ languages). +- **Native Math Equations**: LaTeX math (`$$...$$`, `\[...\]`, `$...$`, `\(...\)`) converted to editable Word equations. +- **Mermaid Diagrams**: Mermaid flowcharts and sequence diagrams rendered as images in the document. +- **Citations & References**: Auto-generates a References section from OpenWebUI sources with clickable citation links. 
+- **Reasoning Stripping**: Automatically removes AI thinking blocks (`<think>`, `<analysis>`) from exports. +- **Enhanced Tables**: Smart column widths, column alignment (`:---`, `---:`, `:---:`), header row repeat across pages. - **Blockquote Support**: Markdown blockquotes are rendered with left border and gray styling. -- **Multi-language Support**: Properly handles both Chinese and English text without garbled characters. +- **Multi-language Support**: Properly handles both Chinese and English text. - **Smarter Filenames**: Configurable title source (Chat Title, AI Generated, or Markdown Title). ## Configuration @@ -19,24 +24,33 @@ You can configure the following settings via the **Valves** button in the plugin - `chat_title`: Use the conversation title (default). - `ai_generated`: Use AI to generate a short title based on the content. - `markdown_title`: Extract the first h1/h2 heading from the Markdown content. +- **MERMAID_JS_URL**: URL for the Mermaid.js library (for diagram rendering). +- **MERMAID_PNG_SCALE**: Scale factor for Mermaid PNG generation (Resolution). Default: `3.0`. +- **MERMAID_DISPLAY_SCALE**: Scale factor for Mermaid visual size in Word. Default: `1.5`. +- **MERMAID_OPTIMIZE_LAYOUT**: Automatically convert LR (Left-Right) flowcharts to TD (Top-Down). Default: `True`. +- **MERMAID_CAPTIONS_ENABLE**: Enable/disable figure captions for Mermaid diagrams. 
## Supported Markdown Syntax -| Syntax | Word Result | -| :---------------------------------- | :-------------------------------- | -| `# Heading 1` to `###### Heading 6` | Heading levels 1-6 | -| `**bold**` or `__bold__` | Bold text | -| `*italic*` or `_italic_` | Italic text | -| `***bold italic***` | Bold + Italic | -| `` `inline code` `` | Monospace with gray background | -| ` ``` code block ``` ` | **Syntax highlighted** code block | -| `> blockquote` | Left-bordered gray italic text | -| `[link](url)` | Blue underlined link text | -| `~~strikethrough~~` | Strikethrough text | -| `- item` or `* item` | Bullet list | -| `1. item` | Numbered list | -| Markdown tables | Table with grid | -| `---` or `***` | Horizontal rule | +| Syntax | Word Result | +| :---------------------------------- | :------------------------------------ | +| `# Heading 1` to `###### Heading 6` | Heading levels 1-6 | +| `**bold**` or `__bold__` | Bold text | +| `*italic*` or `_italic_` | Italic text | +| `***bold italic***` | Bold + Italic | +| `` `inline code` `` | Monospace with gray background | +| ` ``` code block ``` ` | **Syntax highlighted** code block | +| `> blockquote` | Left-bordered gray italic text | +| `[link](url)` | Blue underlined link text | +| `~~strikethrough~~` | Strikethrough text | +| `- item` or `* item` | Bullet list | +| `1. item` | Numbered list | +| Markdown tables | **Enhanced table** with smart widths | +| `---` or `***` | Horizontal rule | +| `$$LaTeX$$` or `\[LaTeX\]` | **Native Word equation** (display) | +| `$LaTeX$` or `\(LaTeX\)` | **Native Word equation** (inline) | +| ` ```mermaid ... ``` ` | **Mermaid diagram** as image | +| `[1]` citation markers | **Clickable links** to References | ## Usage @@ -44,19 +58,14 @@ You can configure the following settings via the **Valves** button in the plugin 2. In any chat, click the "Export to Word" button. 3. The .docx file will be automatically downloaded to your device. 
- -### Notes - -- Title detection only considers h1/h2 headings. -- If the request carries `chat_id` (body or metadata), the plugin will fetch the chat title from the database when the body lacks one. -- Default fonts: Times New Roman (en), SimSun/SimHei (zh), Consolas (code). - -### Requirements +## Requirements - `python-docx==1.1.2` - Word document generation -- `Pygments>=2.15.0` - Syntax highlighting (optional but recommended) +- `Pygments>=2.15.0` - Syntax highlighting +- `latex2mathml` - LaTeX to MathML conversion +- `mathml2omml` - MathML to Office Math (OMML) conversion -Both are declared in the plugin docstring; ensure they are installed in your environment. +All dependencies are declared in the plugin docstring. ## Font Configuration @@ -64,6 +73,18 @@ Both are declared in the plugin docstring; ensure they are installed in your env - **Chinese Text**: SimSun (宋体) for body, SimHei (黑体) for headings - **Code**: Consolas +## Changelog + +### v0.2.0 +- Added native math equation support (LaTeX → OMML) +- Added Mermaid diagram rendering +- Added citations and references section generation +- Added automatic reasoning block stripping +- Enhanced table formatting with smart column widths and alignment + +### v0.1.1 +- Initial release with basic Markdown to Word conversion + ## Author Fu-Jie diff --git a/plugins/actions/export_to_docx/README_CN.md b/plugins/actions/export_to_docx/README_CN.md index 4482257..bdf9eed 100644 --- a/plugins/actions/export_to_docx/README_CN.md +++ b/plugins/actions/export_to_docx/README_CN.md @@ -1,17 +1,22 @@ # 导出为 Word -将当前对话内容从 Markdown 转换并导出为 Word (.docx) 文件,支持**代码语法高亮**、**引用块样式**和更智能的文件命名。 +将对话导出为 Word (.docx),支持**代码语法高亮**、**原生数学公式**、**Mermaid 图表**、**引用参考**和**增强表格格式**。 ## 功能特点 -- **一键导出**:在聊天界面添加“导出为 Word”动作按钮。 +- **一键导出**:在聊天界面添加"导出为 Word"动作按钮。 - **Markdown 转换**:将 Markdown 语法转换为 Word 格式(标题、粗体、斜体、代码、表格、列表)。 - **代码语法高亮**:使用 Pygments 库为代码块添加语法高亮(支持 500+ 种语言)。 -- **引用块支持**:Markdown 引用块会渲染为带左侧边框的灰色斜体样式。 +- **原生数学公式**:LaTeX 
公式(`$$...$$`、`\[...\]`、`$...$`、`\(...\)`)转换为可编辑的 Word 公式。 +- **Mermaid 图表**:Mermaid 流程图和时序图渲染为文档中的图片。 +- **引用与参考**:自动从 OpenWebUI 来源生成参考资料章节,支持可点击的引用链接。 +- **移除思考过程**:自动移除 AI 思考块(`<think>`、`<analysis>`)。 +- **增强表格**:智能列宽、列对齐(`:---`、`---:`、`:---:`)、表头跨页重复。 +- **引用块支持**:Markdown 引用块渲染为带左侧边框的灰色斜体样式。 - **多语言支持**:正确处理中文和英文文本,无乱码问题。 -- **更智能的文件名**:可配置标题来源(对话标题、AI 生成或 Markdown 标题)。 +- **智能文件名**:可配置标题来源(对话标题、AI 生成或 Markdown 标题)。 -## 配置 (Configuration) +## 配置 您可以通过插件设置中的 **Valves** 按钮配置以下选项: @@ -19,24 +24,33 @@ - `chat_title`:使用对话标题(默认)。 - `ai_generated`:使用 AI 根据内容生成简短标题。 - `markdown_title`:从 Markdown 内容中提取第一个一级或二级标题。 +- **MERMAID_JS_URL**:Mermaid.js 库的 URL(用于图表渲染)。 +- **MERMAID_PNG_SCALE**:Mermaid PNG 生成缩放比例(分辨率)。默认:`3.0`。 +- **MERMAID_DISPLAY_SCALE**:Mermaid 在 Word 中的显示比例(视觉大小)。默认:`1.5`。 +- **MERMAID_OPTIMIZE_LAYOUT**:自动将 LR(左右)流程图转换为 TD(上下)。默认:`True`。 +- **MERMAID_CAPTIONS_ENABLE**:启用/禁用 Mermaid 图表的图注。 ## 支持的 Markdown 语法 -| 语法 | Word 效果 | -| :-------------------------- | :----------------------- | -| `# 标题1` 到 `###### 标题6` | 标题级别 1-6 | -| `**粗体**` 或 `__粗体__` | 粗体文本 | -| `*斜体*` 或 `_斜体_` | 斜体文本 | -| `***粗斜体***` | 粗体 + 斜体 | -| `` `行内代码` `` | 等宽字体 + 灰色背景 | -| ` ``` 代码块 ``` ` | **语法高亮**的代码块 | -| `> 引用文本` | 带左侧边框的灰色斜体文本 | -| `[链接](url)` | 蓝色下划线链接文本 | -| `~~删除线~~` | 删除线文本 | -| `- 项目` 或 `* 项目` | 无序列表 | -| `1. 项目` | 有序列表 | -| Markdown 表格 | 带边框表格 | -| `---` 或 `***` | 水平分割线 | +| 语法 | Word 效果 | +| :---------------------------- | :-------------------------------- | +| `# 标题1` 到 `###### 标题6` | 标题级别 1-6 | +| `**粗体**` 或 `__粗体__` | 粗体文本 | +| `*斜体*` 或 `_斜体_` | 斜体文本 | +| `***粗斜体***` | 粗体 + 斜体 | +| `` `行内代码` `` | 等宽字体 + 灰色背景 | +| ` ``` 代码块 ``` ` | **语法高亮**的代码块 | +| `> 引用文本` | 带左侧边框的灰色斜体文本 | +| `[链接](url)` | 蓝色下划线链接文本 | +| `~~删除线~~` | 删除线文本 | +| `- 项目` 或 `* 项目` | 无序列表 | +| `1. 项目` | 有序列表 | +| Markdown 表格 | **增强表格**(智能列宽) | +| `---` 或 `***` | 水平分割线 | +| `$$LaTeX$$` 或 `\[LaTeX\]` | **原生 Word 公式**(块级) | +| `$LaTeX$` 或 `\(LaTeX\)` | **原生 Word 公式**(行内) | +| ` ```mermaid ... 
``` ` | **Mermaid 图表**(图片形式) | +| `[1]` 引用标记 | **可点击链接**到参考资料 | ## 使用方法 @@ -44,18 +58,14 @@ 2. 在任意对话中,点击"导出为 Word"按钮。 3. .docx 文件将自动下载到你的设备。 -### 说明 - -- 标题检测仅考虑一级/二级标题(h1/h2)。 -- 若请求体或 metadata 提供 `chat_id`,当正文缺少标题时会从数据库查询对话标题。 -- 默认字体:英文 Times New Roman,中文宋体/黑体,代码 Consolas。 - -### 依赖 +## 依赖 - `python-docx==1.1.2` - Word 文档生成 -- `Pygments>=2.15.0` - 语法高亮(可选但建议安装) +- `Pygments>=2.15.0` - 语法高亮 +- `latex2mathml` - LaTeX 转 MathML +- `mathml2omml` - MathML 转 Office Math (OMML) -两者已在插件文档字符串中声明,请确保环境已安装。 +所有依赖已在插件文档字符串中声明。 ## 字体配置 @@ -63,6 +73,18 @@ - **中文文本**:宋体(正文)、黑体(标题) - **代码**:Consolas +## 更新日志 + +### v0.2.0 +- 新增原生数学公式支持(LaTeX → OMML) +- 新增 Mermaid 图表渲染 +- 新增引用与参考资料章节生成 +- 新增自动移除 AI 思考块 +- 增强表格格式(智能列宽、对齐) + +### v0.1.1 +- 初始版本,支持基本 Markdown 转 Word + ## 作者 Fu-Jie diff --git a/plugins/actions/export_to_docx/export_to_word.py b/plugins/actions/export_to_docx/export_to_word.py index ec35c6c..ad60b0d 100644 --- a/plugins/actions/export_to_docx/export_to_word.py +++ b/plugins/actions/export_to_docx/export_to_word.py @@ -3,10 +3,10 @@ title: Export to Word author: Fu-Jie author_url: https://github.com/Fu-Jie funding_url: https://github.com/Fu-Jie/awesome-openwebui -version: 0.1.1 +version: 0.2.0 icon_url: data:image/svg+xml;base64,PHN2ZwogIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIKICB3aWR0aD0iMjQiCiAgaGVpZ2h0PSIyNCIKICB2aWV3Qm94PSIwIDAgMjQgMjQiCiAgZmlsbD0ibm9uZSIKICBzdHJva2U9ImN1cnJlbnRDb2xvciIKICBzdHJva2Utd2lkdGg9IjIiCiAgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIgogIHN0cm9rZS1saW5lam9pbj0icm91bmQiCj4KICA8cGF0aCBkPSJNNiAyMmEyIDIgMCAwIDEtMi0yVjRhMiAyIDAgMCAxIDItMmg4YTIuNCAyLjQgMCAwIDEgMS43MDQuNzA2bDMuNTg4IDMuNTg4QTIuNCAyLjQgMCAwIDEgMjAgOHYxMmEyIDIgMCAwIDEtMiAyeiIgLz4KICA8cGF0aCBkPSJNMTQgMnY1YTEgMSAwIDAgMCAxIDFoNSIgLz4KICA8cGF0aCBkPSJNMTAgOUg4IiAvPgogIDxwYXRoIGQ9Ik0xNiAxM0g4IiAvPgogIDxwYXRoIGQ9Ik0xNiAxN0g4IiAvPgo8L3N2Zz4K -requirements: python-docx==1.1.2, Pygments>=2.15.0 -description: Export current conversation from Markdown to Word (.docx) file with syntax highlighting and 
blockquote support. +requirements: python-docx==1.1.2, latex2mathml, mathml2omml +description: Export conversation to Word (.docx) with syntax highlighting, native math equations (LaTeX), Mermaid diagrams, citations, and enhanced table formatting. """ import os @@ -16,18 +16,32 @@ import datetime import io import asyncio import logging -from typing import Optional, Callable, Awaitable, Any, List, Tuple +from typing import ( + Optional, + Callable, + Awaitable, + Any, + List, + Tuple, + Union, + Dict, + Literal, + cast, +) from docx import Document from docx.shared import Pt, Inches, RGBColor, Cm from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_LINE_SPACING from docx.enum.table import WD_TABLE_ALIGNMENT from docx.enum.style import WD_STYLE_TYPE +from docx.opc.constants import RELATIONSHIP_TYPE as RT from docx.oxml.ns import qn -from docx.oxml import OxmlElement +from docx.oxml import OxmlElement, parse_xml from open_webui.models.chats import Chats from open_webui.models.users import Users from open_webui.utils.chat import generate_chat_completion from pydantic import BaseModel, Field +from dataclasses import dataclass + # Pygments for syntax highlighting try: @@ -39,6 +53,23 @@ try: except ImportError: PYGMENTS_AVAILABLE = False +try: + from latex2mathml.converter import convert as latex_to_mathml + import mathml2omml + + LATEX_MATH_AVAILABLE = True +except Exception: + LATEX_MATH_AVAILABLE = False + +_REASONING_DETAILS_RE = re.compile( + r"]*\btype\s*=\s*(?:\"reasoning\"|'reasoning'|reasoning)[^>]*>.*?", + re.IGNORECASE | re.DOTALL, +) +_THINK_RE = re.compile(r"]*>.*?", re.IGNORECASE | re.DOTALL) +_ANALYSIS_RE = re.compile( + r"]*>.*?", re.IGNORECASE | re.DOTALL +) + logging.basicConfig( level=logging.INFO, @@ -47,15 +78,69 @@ logging.basicConfig( logger = logging.getLogger(__name__) +@dataclass(frozen=True) +class _CitationRef: + idx: int + anchor: str + title: str + url: Optional[str] + source_id: str + + +@dataclass +class _MermaidFenceBlock: + info_raw: str + 
language: str + attrs: List[str] + source: str + + class Action: class Valves(BaseModel): TITLE_SOURCE: str = Field( default="chat_title", description="Title Source: 'chat_title' (Chat Title), 'ai_generated' (AI Generated), 'markdown_title' (Markdown Title)", ) + MERMAID_JS_URL: str = Field( + default="https://cdn.jsdelivr.net/npm/mermaid@10.9.1/dist/mermaid.min.js", + description="Mermaid JS CDN URL", + ) + MERMAID_JSZIP_URL: str = Field( + default="https://cdnjs.cloudflare.com/ajax/libs/jszip/3.10.1/jszip.min.js", + description="JSZip CDN URL (for DOCX manipulation)", + ) + MERMAID_OPTIMIZE_LAYOUT: bool = Field( + default=True, + description="Optimize Mermaid Layout: Automatically convert LR (Left-Right) to TD (Top-Down) for better fit.", + ) + MERMAID_PNG_SCALE: float = Field( + default=3.0, + description="Mermaid PNG Scale (Resolution): Higher = clearer but larger file size. Default: 3.0", + ) + MERMAID_DISPLAY_SCALE: float = Field( + default=1.5, + description="Mermaid Display Scale (Visual Size): >1.0 to enlarge, <1.0 to shrink. 
Default: 1.5", + ) + MERMAID_CAPTIONS_ENABLE: bool = Field( + default=True, + description="Enable Mermaid Captions", + ) + MERMAID_CAPTION_STYLE: str = Field( + default="Caption", + description="Mermaid Caption Style Name", + ) + MERMAID_CAPTION_PREFIX: str = Field( + default="Figure", + description="Mermaid Caption Prefix", + ) def __init__(self): self.valves = self.Valves() + self._mermaid_figure_counter = 0 + self._caption_style_name = "" + self._citation_anchor_by_index: Dict[int, str] = {} + self._citation_refs: List[_CitationRef] = [] + self._bookmark_id_counter: int = 1 async def _send_notification(self, emitter: Callable, type: str, content: str): await emitter( @@ -157,8 +242,17 @@ class Action: # Create Word document; if no h1 exists, inject chat title as h1 has_h1 = bool(re.search(r"^#\s+.+$", message_content, re.MULTILINE)) + + # Extract sources if available (for citations) + sources = ( + last_assistant_message.get("sources") or body.get("sources") or [] + ) + doc = self.markdown_to_docx( - message_content, top_heading=top_heading, has_h1=has_h1 + message_content, + top_heading=top_heading, + has_h1=has_h1, + sources=sources, ) # Save to memory @@ -168,6 +262,13 @@ class Action: file_content = doc_buffer.read() base64_blob = base64.b64encode(file_content).decode("utf-8") + # Escape message_content for JavaScript template literal + escaped_content = ( + message_content.replace("\\", "\\\\") # Escape backslashes first + .replace("`", "\\`") # Escape backticks + .replace("${", "\\${") # Escape template literal expressions + ) + # Trigger file download if __event_call__: await __event_call__( @@ -175,28 +276,197 @@ class Action: "type": "execute", "data": { "code": f""" - try {{ - const base64Data = "{base64_blob}"; - const binaryData = atob(base64Data); - const arrayBuffer = new Uint8Array(binaryData.length); - for (let i = 0; i < binaryData.length; i++) {{ - arrayBuffer[i] = binaryData.charCodeAt(i); - }} - const blob = new Blob([arrayBuffer], {{ type: 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" }}); - const filename = "{filename}"; + (async function() {{ + try {{ + // Parse document.xml to find placeholders and extract optimized code + // We do this FIRST to get the actual code to render (which might have been optimized in Python) + + // Load JSZip + if (!window.JSZip) {{ + await new Promise((resolve, reject) => {{ + const script = document.createElement("script"); + script.src = "{self.valves.MERMAID_JSZIP_URL}"; + script.onload = resolve; + script.onerror = reject; + document.head.appendChild(script); + }}); + }} - const url = URL.createObjectURL(blob); - const a = document.createElement("a"); - a.style.display = "none"; - a.href = url; - a.download = filename; - document.body.appendChild(a); - a.click(); - URL.revokeObjectURL(url); - document.body.removeChild(a); - }} catch (error) {{ - console.error('Error triggering download:', error); - }} + const base64Data = "{base64_blob}"; + const binaryData = atob(base64Data); + const arrayBuffer = new Uint8Array(binaryData.length); + for (let i = 0; i < binaryData.length; i++) {{ + arrayBuffer[i] = binaryData.charCodeAt(i); + }} + + const zip = new JSZip(); + await zip.loadAsync(arrayBuffer); + + // Parse document.xml + const docXml = await zip.file("word/document.xml").async("string"); + const parser = new DOMParser(); + const xmlDoc = parser.parseFromString(docXml, "application/xml"); + + const drawings = xmlDoc.getElementsByTagName("w:drawing"); + const placeholderInfo = []; + + for (let i = 0; i < drawings.length; i++) {{ + const drawing = drawings[i]; + const docPr = drawing.getElementsByTagName("wp:docPr")[0]; + if (docPr) {{ + const descr = docPr.getAttribute("descr"); + if (descr && descr.startsWith("MERMAID_SRC:")) {{ + const encodedCode = descr.substring("MERMAID_SRC:".length); + const code = decodeURIComponent(encodedCode); + + // Find the blip and extent to replace + const parent = drawing.parentNode; // w:r usually, or 
w:drawing parent + // We need to find a:blip and wp:extent within this drawing + const blip = drawing.getElementsByTagName("a:blip")[0]; + const extent = drawing.getElementsByTagName("wp:extent")[0]; + + if (blip && extent) {{ + const rId = blip.getAttribute("r:embed"); + placeholderInfo.push({{ rId, extent, code }}); + }} + }} + }} + }} + + if (placeholderInfo.length === 0) {{ + console.log("No Mermaid placeholders found in DOCX."); + // Just download the file as is + const blob = new Blob([arrayBuffer], {{type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document"}}); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.style.display = "none"; + a.href = url; + a.download = "{filename}"; + document.body.appendChild(a); + a.click(); + URL.revokeObjectURL(url); + document.body.removeChild(a); + return; + }} + + console.log(`Found ${{placeholderInfo.length}} Mermaid placeholders.`); + + // Load Mermaid + if (!window.mermaid) {{ + await new Promise((resolve, reject) => {{ + const script = document.createElement("script"); + script.src = "{self.valves.MERMAID_JS_URL}"; + script.onload = resolve; + script.onerror = reject; + document.head.appendChild(script); + }}); + }} + + mermaid.initialize({{ + startOnLoad: false, + theme: 'default', + }}); + + // Read rels XML once + const relsXml = await zip.file("word/_rels/document.xml.rels").async("string"); + const relsDoc = parser.parseFromString(relsXml, "application/xml"); + const relationships = relsDoc.getElementsByTagName("Relationship"); + const rIdToPath = {{}}; + + for (let i = 0; i < relationships.length; i++) {{ + const rel = relationships[i]; + rIdToPath[rel.getAttribute("Id")] = rel.getAttribute("Target"); + }} + + // Render and replace + console.log(`Processing ${{placeholderInfo.length}} diagrams...`); + + for (let i = 0; i < placeholderInfo.length; i++) {{ + const {{ rId, extent, code }} = placeholderInfo[i]; + const imagePath = "word/" + rIdToPath[rId]; + + 
console.log(`Block ${{i + 1}}/${{placeholderInfo.length}}: Rendering and replacing at ${{imagePath}}`); + + // Render SVG + const id = "mermaid-export-" + i; + const {{ svg }} = await mermaid.render(id, code); + + // Convert SVG to PNG + const canvas = document.createElement("canvas"); + const ctx = canvas.getContext("2d"); + const img = new Image(); + + // Get SVG dimensions + const svgMatch = svg.match(/viewBox="[^"]*\s+[^"]*\s+([^"\s]+)\s+([^"\s]+)"/); + let width = 800; + let height = 600; + if (svgMatch) {{ + width = parseFloat(svgMatch[1]); + height = parseFloat(svgMatch[2]); + }} + + // Scale up for better quality + const scale = {self.valves.MERMAID_PNG_SCALE}; + canvas.width = width * scale; + canvas.height = height * scale; + + await new Promise((resolve, reject) => {{ + img.onload = resolve; + img.onerror = reject; + img.src = "data:image/svg+xml;base64," + btoa(unescape(encodeURIComponent(svg))); + }}); + + ctx.scale(scale, scale); + ctx.drawImage(img, 0, 0, width, height); + + const pngDataUrl = canvas.toDataURL("image/png"); + const pngBase64 = pngDataUrl.split(",")[1]; + + // Replace image in ZIP + zip.file(imagePath, pngBase64, {{base64: true}}); + + // Update dimensions in document.xml (EMUs) + // 1 inch = 914400 EMUs, 1 pixel ≈ 9525 EMUs at 96 DPI + // Max width: ~6 inches (page width minus margins) + const maxWidthEmu = 5486400; // 6 inches + const displayScale = {self.valves.MERMAID_DISPLAY_SCALE}; + let emuWidth = Math.round(width * 9525 * displayScale); + let emuHeight = Math.round(height * 9525 * displayScale); + + // Scale down if too wide + if (emuWidth > maxWidthEmu) {{ + const scaleFactor = maxWidthEmu / emuWidth; + emuWidth = maxWidthEmu; + emuHeight = Math.round(emuHeight * scaleFactor); + }} + + extent.setAttribute("cx", emuWidth); + extent.setAttribute("cy", emuHeight); + }} + + // Serialize updated XML + const serializer = new XMLSerializer(); + const newDocXml = serializer.serializeToString(xmlDoc); + zip.file("word/document.xml", 
newDocXml); + + // Generate final blob + const finalBlob = await zip.generateAsync({{type: "blob"}}); + const filename = "{filename}"; + + const url = URL.createObjectURL(finalBlob); + const a = document.createElement("a"); + a.style.display = "none"; + a.href = url; + a.download = filename; + document.body.appendChild(a); + a.click(); + URL.revokeObjectURL(url); + document.body.removeChild(a); + }} catch (error) {{ + console.error('Error triggering download:', error); + alert('Export failed: ' + error.message); + }} + }})(); """ }, } @@ -335,22 +605,367 @@ class Action: title = data.get("title") or getattr(chat, "title", "") return title.strip() if isinstance(title, str) else "" + def _strip_reasoning_blocks(self, text: str) -> str: + """ + Strip model reasoning blocks from assistant Markdown before export. + """ + if not text: + return text + + cur = text + for _ in range(10): + prev = cur + cur = _REASONING_DETAILS_RE.sub("", cur) + cur = _THINK_RE.sub("", cur) + cur = _ANALYSIS_RE.sub("", cur) + if cur == prev: + break + + # Clean up excessive blank lines left by removals. + cur = re.sub(r"\n{4,}", "\n\n\n", cur) + return cur + + def _add_display_equation(self, doc: Document, latex: str): + latex = (latex or "").strip() + if not latex: + return + + if not LATEX_MATH_AVAILABLE: + self.add_code_block(doc, latex, "latex") + return + + try: + mathml = latex_to_mathml(latex) + omml = mathml2omml.convert(mathml) + para = doc.add_paragraph() + para.alignment = WD_ALIGN_PARAGRAPH.CENTER + cast(Any, para)._p.append(self._wrap_omml_for_word(omml)) + except Exception as exc: + logger.warning(f"Math conversion failed; falling back to text: {exc}") + self.add_code_block(doc, latex, "latex") + + def _wrap_omml_for_word(self, omml: str): + m_ns = "http://schemas.openxmlformats.org/officeDocument/2006/math" + w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + # Keep the OMML payload as-is, but ensure it has the math namespace declared. 
+ xml = f'{omml}' + return parse_xml(xml) + + def _add_inline_equation( + self, + paragraph, + latex: str, + bold: bool = False, + italic: bool = False, + strike: bool = False, + ): + latex = (latex or "").strip() + if not latex: + return + + if not LATEX_MATH_AVAILABLE: + self._add_text_run( + paragraph, f"\\({latex}\\)", bold=bold, italic=italic, strike=strike + ) + return + + try: + mathml = latex_to_mathml(latex) + omml = mathml2omml.convert(mathml) + o_math = self._omml_oMath_element(omml) + run = paragraph.add_run() + run.bold = bold + run.italic = italic + run.font.strike = strike + cast(Any, run)._r.append(o_math) + except Exception as exc: + logger.warning(f"Inline math conversion failed; keeping literal: {exc}") + self._add_text_run( + paragraph, f"\\({latex}\\)", bold=bold, italic=italic, strike=strike + ) + + def _omml_oMath_element(self, omml: str): + # Ensure the OMML element declares the math namespace so parse_xml works. + m_ns = "http://schemas.openxmlformats.org/officeDocument/2006/math" + s = (omml or "").strip() + if s.startswith("") and s.endswith(""): + inner = s[len("") : -len("")] + s = f'{inner}' + elif s.startswith("", 1)[0]: + s = s.replace(" List[_CitationRef]: + citation_idx_map: Dict[str, int] = {} + refs_by_idx: Dict[int, _CitationRef] = {} + + for source in sources or []: + if not isinstance(source, dict): + continue + + documents = source.get("document") or [] + metadatas = source.get("metadata") or [] + src_info = source.get("source") or {} + + src_name = src_info.get("name") if isinstance(src_info, dict) else None + src_id_default = src_info.get("id") if isinstance(src_info, dict) else None + src_urls = src_info.get("urls") if isinstance(src_info, dict) else None + + if not isinstance(documents, list): + documents = [] + if not isinstance(metadatas, list): + metadatas = [] + + for idx_doc, _doc_text in enumerate(documents): + meta = metadatas[idx_doc] if idx_doc < len(metadatas) else {} + if not isinstance(meta, dict): + meta = {} 
+ + source_id = meta.get("source") or src_id_default or "N/A" + source_id_str = str(source_id) + + if source_id_str not in citation_idx_map: + citation_idx_map[source_id_str] = len(citation_idx_map) + 1 + idx = citation_idx_map[source_id_str] + + if idx in refs_by_idx: + continue + + url: Optional[str] = None + if isinstance(source_id, str) and re.match(r"^https?://", source_id): + url = source_id + elif isinstance(meta.get("url"), str) and re.match( + r"^https?://", meta["url"] + ): + url = meta["url"] + elif isinstance(src_urls, list) and src_urls: + if isinstance(src_urls[0], str) and re.match( + r"^https?://", src_urls[0] + ): + url = src_urls[0] + + title = ( + (meta.get("title") if isinstance(meta.get("title"), str) else None) + or (meta.get("name") if isinstance(meta.get("name"), str) else None) + or ( + src_name + if isinstance(src_name, str) and src_name.strip() + else None + ) + or (url if url else None) + or source_id_str + ) + + anchor = f"OWUIRef{idx}" + refs_by_idx[idx] = _CitationRef( + idx=idx, + anchor=anchor, + title=title, + url=url, + source_id=source_id_str, + ) + + return [refs_by_idx[i] for i in sorted(refs_by_idx.keys())] + + def _add_bookmark(self, paragraph, name: str): + bookmark_id = self._bookmark_id_counter + self._bookmark_id_counter += 1 + + start = OxmlElement("w:bookmarkStart") + start.set(qn("w:id"), str(bookmark_id)) + start.set(qn("w:name"), name) + + end = OxmlElement("w:bookmarkEnd") + end.set(qn("w:id"), str(bookmark_id)) + + p = cast(Any, paragraph)._p + p.insert(0, start) + p.append(end) + + def _add_internal_hyperlink(self, paragraph, display_text: str, anchor: str): + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("w:anchor"), anchor) + + new_run = OxmlElement("w:r") + rPr = OxmlElement("w:rPr") + rStyle = OxmlElement("w:rStyle") + rStyle.set(qn("w:val"), "Hyperlink") + rPr.append(rStyle) + + new_run.append(rPr) + t = OxmlElement("w:t") + t.text = display_text + new_run.append(t) + + hyperlink.append(new_run) + 
cast(Any, paragraph)._p.append(hyperlink) + + def _add_references_section(self, doc: Document): + self.add_heading(doc, "References", 2) + + for ref in self._citation_refs: + para = doc.add_paragraph(style="List Number") + self._add_bookmark(para, ref.anchor) + # Include URL as an external link when available. + if ref.url: + self._add_hyperlink(para, ref.title, ref.url, display_text=ref.title) + else: + self._add_text_run( + para, ref.title, bold=False, italic=False, strike=False + ) + + def _normalize_url(self, url: str) -> str: + u = (url or "").strip() + if u.lower().startswith("www."): + u = "https://" + u + + # Trim common trailing punctuation that often follows URLs in prose. + while u and u[-1] in ".,;:!?)]}": + u = u[:-1] + return u + + def _add_hyperlink( + self, paragraph, text: str, url: str, display_text: Optional[str] = None + ): + u = self._normalize_url(url) + if not u: + paragraph.add_run(display_text or text) + return + + part = getattr(paragraph, "part", None) + if part is None or not hasattr(part, "relate_to"): + # Fallback if relationship API isn't available. 
+ run = paragraph.add_run(display_text or text) + run.font.color.rgb = RGBColor(0, 0, 255) + run.font.underline = True + return + + r_id = part.relate_to(u, RT.HYPERLINK, is_external=True) + + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("r:id"), r_id) + + new_run = OxmlElement("w:r") + rPr = OxmlElement("w:rPr") + rStyle = OxmlElement("w:rStyle") + rStyle.set(qn("w:val"), "Hyperlink") + rPr.append(rStyle) + + color = OxmlElement("w:color") + color.set(qn("w:val"), "0000FF") + rPr.append(color) + + u_el = OxmlElement("w:u") + u_el.set(qn("w:val"), "single") + rPr.append(u_el) + + new_run.append(rPr) + + t = OxmlElement("w:t") + t.text = display_text or text + new_run.append(t) + + hyperlink.append(new_run) + cast(Any, paragraph)._p.append(hyperlink) + + def _add_text_run(self, paragraph, s: str, bold: bool, italic: bool, strike: bool): + if not s: + return + run = paragraph.add_run(s) + if bold: + run.bold = True + if italic: + run.italic = True + if strike: + run.font.strike = True + + # Set Chinese font (copying from existing add_paragraph logic) + run.font.name = "Times New Roman" + run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimSun") + + def _add_inline_code(self, paragraph, s: str): + if s == "": + return + + # Simple inline code without URL parsing for now, or copy full logic if needed. + # For now, just basic styling to match existing. 
+ run = paragraph.add_run(s) + run.font.name = "Consolas" + run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimHei") + run.font.size = Pt(10) + shading = OxmlElement("w:shd") + shading.set(qn("w:fill"), "E8E8E8") + run._element.rPr.append(shading) + + def _add_hyperlink_code(self, paragraph, display_text: str, url: str): + u = self._normalize_url(url) + if not u: + self._add_inline_code(paragraph, display_text) + return + + part = getattr(paragraph, "part", None) + if part is None or not hasattr(part, "relate_to"): + self._add_inline_code(paragraph, display_text) + return + + r_id = part.relate_to(u, RT.HYPERLINK, is_external=True) + + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("r:id"), r_id) + + new_run = OxmlElement("w:r") + rPr = OxmlElement("w:rPr") + + rFonts = OxmlElement("w:rFonts") + rFonts.set(qn("w:ascii"), "Consolas") + rFonts.set(qn("w:hAnsi"), "Consolas") + rFonts.set(qn("w:eastAsia"), "SimHei") + rPr.append(rFonts) + + sz = OxmlElement("w:sz") + sz.set(qn("w:val"), "20") # 10pt + rPr.append(sz) + + shading = OxmlElement("w:shd") + shading.set(qn("w:fill"), "E8E8E8") + rPr.append(shading) + + new_run.append(rPr) + + t = OxmlElement("w:t") + t.text = display_text + new_run.append(t) + + hyperlink.append(new_run) + cast(Any, paragraph)._p.append(hyperlink) + def clean_filename(self, name: str) -> str: """Clean illegal characters from filename""" return re.sub(r'[\\/*?:"<>|]', "", name).strip()[:50] def markdown_to_docx( - self, markdown_text: str, top_heading: str = "", has_h1: bool = False + self, + markdown_text: str, + top_heading: str = "", + has_h1: bool = False, + sources: Optional[List[dict]] = None, ) -> Document: """ Convert Markdown text to Word document - Supports: headings, paragraphs, bold, italic, code blocks, lists, tables, links + Supports: headings, paragraphs, bold, italic, code blocks, lists, tables, links, + native math, citations, and stripped reasoning. 
""" doc = Document() # Set default fonts self.set_document_default_font(doc) + # Build citation references + self._citation_refs = self._build_citation_refs(sources) + + # Strip reasoning blocks + markdown_text = self._strip_reasoning_blocks(markdown_text) + # If there is no h1 in content, prepend chat title as h1 when provided if top_heading and not has_h1: self.add_heading(doc, top_heading, 1) @@ -377,14 +992,58 @@ class Action: in_list = False in_code_block = True - code_block_lang = line.strip()[3:].strip() + code_block_info_raw = line.strip()[3:].strip() + code_block_lang, code_block_attrs = self._parse_fence_info( + code_block_info_raw + ) code_block_content = [] else: # End code block in_code_block = False - self.add_code_block( - doc, "\n".join(code_block_content), code_block_lang - ) + code_text = "\n".join(code_block_content) + + # Check for Mermaid or Flowchart + mermaid_langs = { + "mermaid", + "flowchart", + "sequence", + "gantt", + "class", + "state", + "pie", + "er", + "journey", + "gitgraph", + "mindmap", + } + + if code_block_lang.lower() in mermaid_langs: + # Create Mermaid Block Object + block = _MermaidFenceBlock( + info_raw=code_block_info_raw, + language=code_block_lang, + attrs=code_block_attrs, + source=code_text, + ) + # Handle Mermaid diagram + if code_block_lang == "mermaid": + # Optimize layout if enabled + if self.valves.MERMAID_OPTIMIZE_LAYOUT: + # Replace LR with TD for graph and flowchart + code_text = re.sub( + r"^(graph|flowchart)\s+LR\b", + r"\1 TD", + code_text, + flags=re.MULTILINE | re.IGNORECASE, + ) + + self._insert_mermaid_placeholder(doc, code_text) + else: + # Insert Placeholder using the block object + self._insert_mermaid_placeholder(doc, block) + else: + self.add_code_block(doc, code_text, code_block_lang) + code_block_content = [] code_block_lang = "" i += 1 @@ -395,6 +1054,66 @@ class Action: i += 1 continue + # Handle Math Blocks: $$...$$ or \[...\] + # Simple detection: if line starts with $$ or \[, treat as math 
block start + stripped_line = line.strip() + if stripped_line.startswith("$$") or stripped_line.startswith("\\["): + # Process pending list first + if in_list and list_items: + self.add_list_to_doc(doc, list_items, list_type) + list_items = [] + in_list = False + + # Check if it's a single-line block like $$ E=mc^2 $$ + if ( + stripped_line.startswith("$$") + and stripped_line.endswith("$$") + and len(stripped_line) > 2 + ) or ( + stripped_line.startswith("\\[") + and stripped_line.endswith("\\]") + and len(stripped_line) > 2 + ): + # Extract content + if stripped_line.startswith("$$"): + math_content = stripped_line[2:-2] + else: + math_content = stripped_line[2:-2] + self._add_display_equation(doc, math_content) + i += 1 + continue + + # Multi-line math block + math_lines = [] + # Remove opening marker + if stripped_line.startswith("$$"): + current_line_content = stripped_line[2:] + end_marker = "$$" + else: + current_line_content = stripped_line[2:] + end_marker = "\\]" + + if current_line_content.strip(): + math_lines.append(current_line_content) + + i += 1 + block_closed = False + while i < len(lines): + next_line = lines[i] + if next_line.strip().endswith(end_marker): + # Found end + content_before_end = next_line.strip()[: -len(end_marker)] + if content_before_end.strip(): + math_lines.append(content_before_end) + block_closed = True + i += 1 + break + math_lines.append(next_line) + i += 1 + + self._add_display_equation(doc, "\n".join(math_lines)) + continue + # Handle tables if line.strip().startswith("|") and line.strip().endswith("|"): # Process pending list first @@ -506,6 +1225,10 @@ class Action: if in_list and list_items: self.add_list_to_doc(doc, list_items, list_type) + # Add References Section if citations exist + if self._citation_refs: + self._add_references_section(doc) + return doc def set_document_default_font(self, doc: Document): @@ -552,10 +1275,16 @@ class Action: def add_formatted_text(self, paragraph, text: str): """ Parse Markdown 
inline formatting and add to paragraph - Supports: bold, italic, inline code, links, strikethrough + Supports: bold, italic, inline code, links, strikethrough, inline math, citations """ # Define formatting patterns patterns = [ + # Inline Math \( ... \) + (r"\\\((.+?)\\\)", {"math": True}), + # Inline Math $...$ (single dollar signs, non-greedy) + (r"(? pos: plain_text = text[pos : match["start"]] if plain_text: - paragraph.add_run(plain_text) + self._add_text_run(paragraph, plain_text, False, False, False) # Add formatted text style = match["style"] run_text = match["text"] - if style.get("link"): + if style.get("math"): + self._add_inline_equation(paragraph, run_text) + elif style.get("citation"): + idx = int(run_text) + # Find the anchor for this index + ref = next((r for r in self._citation_refs if r.idx == idx), None) + if ref: + self._add_internal_hyperlink(paragraph, f"[{idx}]", ref.anchor) + else: + self._add_text_run(paragraph, f"[{idx}]", False, False, False) + elif style.get("link"): # Link handling - run = paragraph.add_run(run_text) - run.font.color.rgb = RGBColor(0, 0, 255) - run.font.underline = True + self._add_hyperlink(paragraph, run_text, match["url"]) elif style.get("code"): # Inline code - run = paragraph.add_run(run_text) - run.font.name = "Consolas" - run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimHei") - run.font.size = Pt(10) - # Add background color - shading = OxmlElement("w:shd") - shading.set(qn("w:fill"), "E8E8E8") - run._element.rPr.append(shading) + self._add_inline_code(paragraph, run_text) else: - run = paragraph.add_run(run_text) - if style.get("bold"): - run.bold = True - if style.get("italic"): - run.italic = True - if style.get("strike"): - run.font.strike = True + # For bold/italic/strike, check if the text contains inline math + # Pattern for inline math: \(...\) or $...$ + math_pattern = r"(\\\((.+?)\\\)|\$([^$]+?)\$)" + math_matches = list(re.finditer(math_pattern, run_text)) + + if math_matches: + # Process text with 
inline math + text_pos = 0 + for math_match in math_matches: + # Add text before math + if math_match.start() > text_pos: + before_text = run_text[text_pos : math_match.start()] + self._add_text_run( + paragraph, + before_text, + bold=style.get("bold", False), + italic=style.get("italic", False), + strike=style.get("strike", False), + ) + # Add inline equation with formatting + latex_content = math_match.group(2) or math_match.group(3) + self._add_inline_equation( + paragraph, + latex_content, + bold=style.get("bold", False), + italic=style.get("italic", False), + strike=style.get("strike", False), + ) + text_pos = math_match.end() + # Add remaining text after last math + if text_pos < len(run_text): + self._add_text_run( + paragraph, + run_text[text_pos:], + bold=style.get("bold", False), + italic=style.get("italic", False), + strike=style.get("strike", False), + ) + else: + self._add_text_run( + paragraph, + run_text, + bold=style.get("bold", False), + italic=style.get("italic", False), + strike=style.get("strike", False), + ) pos = match["end"] # Add remaining plain text if pos < len(text): - paragraph.add_run(text[pos:]) + self._add_text_run(paragraph, text[pos:], False, False, False) def add_code_block(self, doc: Document, code: str, language: str = ""): """Add code block with syntax highlighting""" @@ -736,75 +1517,264 @@ class Action: run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimHei") run.font.size = Pt(10) + def _insert_mermaid_placeholder( + self, doc: Document, block: Union[_MermaidFenceBlock, str] + ): + self._mermaid_figure_counter += 1 + if isinstance(block, str): + code = block + else: + code = block.source + + # Create unique transparent PNG for each placeholder + # By varying image dimensions, we ensure python-docx doesn't reuse the same image file + # Use figure_counter to create different sizes (1x1, 1x2, 1x3, ...) 
+ from PIL import Image + + # Create a transparent image with size 1 x counter (ensures each is unique) + img = Image.new("RGBA", (1, self._mermaid_figure_counter), (0, 0, 0, 0)) + image_stream = io.BytesIO() + img.save(image_stream, format="PNG") + image_stream.seek(0) + + # Add paragraph with center alignment + paragraph = doc.add_paragraph() + paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER + run = paragraph.add_run() + + # Add picture (default size, will be resized by JS) + # We set a small size initially + picture = run.add_picture(image_stream, width=Inches(1)) + + # Set Alt Text (Description) to "MERMAID_SRC:" + # This is the magic link between Python and JS + import urllib.parse + + encoded_code = urllib.parse.quote(code) + + # Access the underlying XML to set docPr descr + # picture is an InlineShape, but run.add_picture returns an InlineShape proxy + # We need to get the wp:docPr element + + inline = picture._inline + docPr = inline.docPr + + # Use .set() to ensure attributes are written to XML + docPr.set("descr", f"MERMAID_SRC:{encoded_code}") + docPr.set("title", "Mermaid Diagram Placeholder") + + # Add Caption + if self.valves.MERMAID_CAPTIONS_ENABLE: + self._add_mermaid_caption(doc, self._mermaid_figure_counter) + + def _add_mermaid_caption(self, doc: Document, figure_number: int): + if not self._caption_style_name: + self._ensure_caption_style(doc) + self._caption_style_name = self.valves.MERMAID_CAPTION_STYLE + + caption_text = f"{self.valves.MERMAID_CAPTION_PREFIX} {figure_number}" + paragraph = doc.add_paragraph(caption_text, style=self._caption_style_name) + paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER + paragraph.paragraph_format.keep_with_next = False + + def _ensure_caption_style(self, doc: Document): + style_name = self.valves.MERMAID_CAPTION_STYLE + styles = doc.styles + if style_name not in styles: + style = styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH) + style.base_style = styles["Normal"] + style.next_paragraph_style = 
styles["Normal"] + font = style.font + font.name = "Times New Roman" + font.size = Pt(10) + font.italic = True + font.color.rgb = RGBColor(0x55, 0x55, 0x55) # Dark Grey + + def _parse_fence_info(self, info_raw: str) -> Tuple[str, List[str]]: + parts = info_raw.split() + if not parts: + return "", [] + lang = parts[0] + attrs = parts[1:] + return lang, attrs + def add_table(self, doc: Document, table_lines: List[str]): - """Add table with header shading and zebra striping""" + """Add Markdown table with smart sizing, alignment, and hyperlinks/math support in cells.""" if len(table_lines) < 2: return + header_fill = "F2F2F2" + zebra_fill = "FBFBFB" + + def _split_row(line: str) -> List[str]: + # Keep empty cells, trim surrounding pipes. + raw = line.strip().strip("|") + return [c.strip() for c in raw.split("|")] + + def _is_separator_row(cells: List[str]) -> bool: + # Markdown separator: --- / :--- / ---: / :---: + if not cells: + return False + ok = 0 + for c in cells: + c = c.strip() + if re.fullmatch(r":?-{3,}:?", c): + ok += 1 + return ok == len(cells) + + def _col_align(cell: str) -> WD_ALIGN_PARAGRAPH: + s = (cell or "").strip() + if s.startswith(":") and s.endswith(":"): + return WD_ALIGN_PARAGRAPH.CENTER + if s.endswith(":"): + return WD_ALIGN_PARAGRAPH.RIGHT + return WD_ALIGN_PARAGRAPH.LEFT + def _set_cell_shading(cell, fill: str): tc_pr = cell._element.get_or_add_tcPr() shd = OxmlElement("w:shd") shd.set(qn("w:fill"), fill) tc_pr.append(shd) - header_fill = "F2F2F2" - zebra_fill = "FBFBFB" - - # Parse table data - rows = [] - for line in table_lines: - cells = [cell.strip() for cell in line.strip().strip("|").split("|")] - # Skip separator row - if all(re.fullmatch(r"[-:]+", cell) for cell in cells): - continue - rows.append(cells) - - if not rows: + raw_rows = [_split_row(l) for l in table_lines if l.strip().startswith("|")] + if not raw_rows: return - # Determine column count - num_cols = max(len(row) for row in rows) + sep_idx = 1 if len(raw_rows) > 1 
and _is_separator_row(raw_rows[1]) else -1 + header = raw_rows[0] + body = raw_rows[sep_idx + 1 :] if sep_idx >= 0 else raw_rows[1:] - # Create table - table = doc.add_table(rows=len(rows), cols=num_cols) + num_cols = max(len(header), *(len(r) for r in body)) if body else len(header) + header = header + [""] * (num_cols - len(header)) + body = [r + [""] * (num_cols - len(r)) for r in body] + + aligns = [ + _col_align(c) for c in (raw_rows[1] if sep_idx == 1 else [""] * num_cols) + ] + + table = doc.add_table(rows=1 + len(body), cols=num_cols) table.style = "Table Grid" - table.alignment = WD_TABLE_ALIGNMENT.CENTER + table.alignment = WD_TABLE_ALIGNMENT.LEFT + cast(Any, table).autofit = False - # Fill table - for row_idx, row_data in enumerate(rows): - row = table.rows[row_idx] - for col_idx, cell_text in enumerate(row_data): - if col_idx < num_cols: - cell = row.cells[col_idx] - # Clear default paragraph - cell.paragraphs[0].clear() - para = cell.paragraphs[0] - para.paragraph_format.space_after = Pt(3) - para.paragraph_format.space_before = Pt(1) - para.alignment = WD_ALIGN_PARAGRAPH.LEFT + # Cell margins (twips): smaller padding for compact tables. + self._set_table_cell_margins(table, top=60, bottom=60, left=90, right=90) - self.add_formatted_text(para, cell_text) + # Column widths: proportional to content, bounded, then normalized to page width. 
+ available_width = int(self._available_block_width(doc)) + min_col = max(int(Inches(0.55)), available_width // max(1, num_cols * 3)) - # Set cell font - for run in para.runs: - run.font.name = "Times New Roman" - run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimSun") - run.font.size = Pt(10) + def _plain_len(s: str) -> int: + t = re.sub(r"`([^`]+)`", r"\1", s or "") + t = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"\1", t) + t = re.sub(r"\s+", " ", t).strip() + return len(t) - # Header bold + shading - if row_idx == 0: - for run in para.runs: - run.bold = True - _set_cell_shading(cell, header_fill) - # Zebra striping - elif row_idx % 2 == 1: - _set_cell_shading(cell, zebra_fill) + weights: List[int] = [] + for ci in range(num_cols): + max_len = _plain_len(header[ci]) + for r in body: + max_len = max(max_len, _plain_len(r[ci])) + weights.append(max(1, min(max_len, 40))) - # Left align all cells for readability - for row in table.rows: - for cell in row.cells: - for para in cell.paragraphs: - para.alignment = WD_ALIGN_PARAGRAPH.LEFT + sum_w = sum(weights) or 1 + widths = [max(min_col, int(available_width * w / sum_w)) for w in weights] + total = sum(widths) + if total > available_width: + even = max(1, available_width // max(1, num_cols)) + widths = [even] * num_cols + total = sum(widths) + if total < available_width: + rem = available_width - total + order = sorted(range(num_cols), key=lambda i: weights[i], reverse=True) + oi = 0 + while rem > 0 and order: + widths[order[oi % len(order)]] += 1 + rem -= 1 + oi += 1 + + for ci, w in enumerate(widths): + table.columns[ci].width = w + for row in table.rows: + row.cells[ci].width = w + + def _format_cell_paragraph(para, align: WD_ALIGN_PARAGRAPH): + para.alignment = align + pf = para.paragraph_format + pf.space_before = Pt(0) + pf.space_after = Pt(0) + pf.line_spacing_rule = WD_LINE_SPACING.SINGLE + + def _fill_cell(cell, text: str, align: WD_ALIGN_PARAGRAPH, bold: bool = False): + cell.text = "" + parts = [ + p for p in 
re.split(r"(?:|\\n)", text or "") if p is not None + ] + if not parts: + parts = [""] + for pi, part in enumerate(parts): + para = cell.paragraphs[0] if pi == 0 else cell.add_paragraph() + _format_cell_paragraph(para, align) + self.add_formatted_text(para, part) + for run in para.runs: + run.font.size = Pt(9) + run.font.name = "Times New Roman" + run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimSun") + if bold: + run.bold = True + + # Header row + header_row = table.rows[0] + self._set_table_header_row_repeat(header_row) + for ci in range(num_cols): + cell = header_row.cells[ci] + _set_cell_shading(cell, header_fill) + _fill_cell( + cell, + header[ci], + aligns[ci] if ci < len(aligns) else WD_ALIGN_PARAGRAPH.LEFT, + bold=True, + ) + + # Body rows + for ri, row_data in enumerate(body, start=1): + row = table.rows[ri] + for ci in range(num_cols): + cell = row.cells[ci] + if (ri % 2) == 0: + _set_cell_shading(cell, zebra_fill) + _fill_cell( + cell, + row_data[ci], + aligns[ci] if ci < len(aligns) else WD_ALIGN_PARAGRAPH.LEFT, + ) + + def _available_block_width(self, doc: Document): + section = doc.sections[0] + return section.page_width - section.left_margin - section.right_margin + + def _set_table_cell_margins( + self, table, top: int, bottom: int, left: int, right: int + ): + tbl_pr = cast(Any, table)._tbl.tblPr + tbl_cell_mar = OxmlElement("w:tblCellMar") + for tag, val in ( + ("top", top), + ("bottom", bottom), + ("left", left), + ("right", right), + ): + el = OxmlElement(f"w:{tag}") + el.set(qn("w:w"), str(int(val))) + el.set(qn("w:type"), "dxa") + tbl_cell_mar.append(el) + tbl_pr.append(tbl_cell_mar) + + def _set_table_header_row_repeat(self, row): + tr_pr = row._tr.get_or_add_trPr() + tbl_header = OxmlElement("w:tblHeader") + tbl_header.set(qn("w:val"), "true") + tr_pr.append(tbl_header) def add_list_to_doc( self, doc: Document, items: List[Tuple[int, str]], list_type: str diff --git a/plugins/actions/export_to_docx/export_to_word_cn.py 
b/plugins/actions/export_to_docx/export_to_word_cn.py index b399599..8eac987 100644 --- a/plugins/actions/export_to_docx/export_to_word_cn.py +++ b/plugins/actions/export_to_docx/export_to_word_cn.py @@ -3,10 +3,10 @@ title: 导出为 Word author: Fu-Jie author_url: https://github.com/Fu-Jie funding_url: https://github.com/Fu-Jie/awesome-openwebui -version: 0.1.1 +version: 0.2.0 icon_url: data:image/svg+xml;base64,PHN2ZwogIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIKICB3aWR0aD0iMjQiCiAgaGVpZ2h0PSIyNCIKICB2aWV3Qm94PSIwIDAgMjQgMjQiCiAgZmlsbD0ibm9uZSIKICBzdHJva2U9ImN1cnJlbnRDb2xvciIKICBzdHJva2Utd2lkdGg9IjIiCiAgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIgogIHN0cm9rZS1saW5lam9pbj0icm91bmQiCj4KICA8cGF0aCBkPSJNNiAyMmEyIDIgMCAwIDEtMi0yVjRhMiAyIDAgMCAxIDItMmg4YTIuNCAyLjQgMCAwIDEgMS43MDQuNzA2bDMuNTg4IDMuNTg4QTIuNCAyLjQgMCAwIDEgMjAgOHYxMmEyIDIgMCAwIDEtMiAyeiIgLz4KICA8cGF0aCBkPSJNMTQgMnY1YTEgMSAwIDAgMCAxIDFoNSIgLz4KICA8cGF0aCBkPSJNMTAgOUg4IiAvPgogIDxwYXRoIGQ9Ik0xNiAxM0g4IiAvPgogIDxwYXRoIGQ9Ik0xNiAxN0g4IiAvPgo8L3N2Zz4K -requirements: python-docx==1.1.2, Pygments>=2.15.0 -description: 将当前对话内容从 Markdown 转换并导出为 Word (.docx) 文件,支持代码语法高亮和引用块。 +requirements: python-docx==1.1.2, Pygments>=2.15.0, latex2mathml, mathml2omml +description: 将对话导出为 Word (.docx),支持代码高亮、原生数学公式 (LaTeX)、Mermaid 图表、引用参考和增强表格格式。 """ import os @@ -16,18 +16,32 @@ import datetime import io import asyncio import logging -from typing import Optional, Callable, Awaitable, Any, List, Tuple +from typing import ( + Optional, + Callable, + Awaitable, + Any, + List, + Tuple, + Union, + Dict, + Literal, + cast, +) from docx import Document from docx.shared import Pt, Inches, RGBColor, Cm from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_LINE_SPACING from docx.enum.table import WD_TABLE_ALIGNMENT from docx.enum.style import WD_STYLE_TYPE +from docx.opc.constants import RELATIONSHIP_TYPE as RT from docx.oxml.ns import qn -from docx.oxml import OxmlElement +from docx.oxml import OxmlElement, parse_xml from open_webui.models.chats import Chats 
from open_webui.models.users import Users from open_webui.utils.chat import generate_chat_completion from pydantic import BaseModel, Field +from dataclasses import dataclass + # Pygments for syntax highlighting try: @@ -39,6 +53,23 @@ try: except ImportError: PYGMENTS_AVAILABLE = False +try: + from latex2mathml.converter import convert as latex_to_mathml + import mathml2omml + + LATEX_MATH_AVAILABLE = True +except Exception: + LATEX_MATH_AVAILABLE = False + +_REASONING_DETAILS_RE = re.compile( + r"]*\btype\s*=\s*(?:\"reasoning\"|'reasoning'|reasoning)[^>]*>.*?", + re.IGNORECASE | re.DOTALL, +) +_THINK_RE = re.compile(r"]*>.*?", re.IGNORECASE | re.DOTALL) +_ANALYSIS_RE = re.compile( + r"]*>.*?", re.IGNORECASE | re.DOTALL +) + logging.basicConfig( level=logging.INFO, @@ -47,15 +78,69 @@ logging.basicConfig( logger = logging.getLogger(__name__) +@dataclass(frozen=True) +class _CitationRef: + idx: int + anchor: str + title: str + url: Optional[str] + source_id: str + + +@dataclass +class _MermaidFenceBlock: + info_raw: str + language: str + attrs: List[str] + source: str + + class Action: class Valves(BaseModel): TITLE_SOURCE: str = Field( default="chat_title", description="标题来源: 'chat_title' (对话标题), 'ai_generated' (AI 生成), 'markdown_title' (Markdown 标题)", ) + MERMAID_JS_URL: str = Field( + default="https://cdn.jsdelivr.net/npm/mermaid@10.9.1/dist/mermaid.min.js", + description="Mermaid JS CDN URL", + ) + MERMAID_JSZIP_URL: str = Field( + default="https://cdnjs.cloudflare.com/ajax/libs/jszip/3.10.1/jszip.min.js", + description="JSZip CDN URL (用于 DOCX 操作)", + ) + MERMAID_OPTIMIZE_LAYOUT: bool = Field( + default=True, + description="优化 Mermaid 布局: 自动将 LR (左右) 转换为 TD (上下) 以适应页面。", + ) + MERMAID_PNG_SCALE: float = Field( + default=3.0, + description="Mermaid PNG 缩放比例 (分辨率): 越高越清晰但文件越大。默认: 3.0", + ) + MERMAID_DISPLAY_SCALE: float = Field( + default=1.5, + description="Mermaid 显示比例 (视觉大小): >1.0 放大, <1.0 缩小。默认: 1.5", + ) + MERMAID_CAPTIONS_ENABLE: bool = Field( + 
default=True, + description="启用 Mermaid 图表标题", + ) + MERMAID_CAPTION_STYLE: str = Field( + default="Caption", + description="Mermaid 标题样式名称", + ) + MERMAID_CAPTION_PREFIX: str = Field( + default="图", + description="Mermaid 标题前缀", + ) def __init__(self): self.valves = self.Valves() + self._mermaid_figure_counter = 0 + self._caption_style_name = "" + self._citation_anchor_by_index: Dict[int, str] = {} + self._citation_refs: List[_CitationRef] = [] + self._bookmark_id_counter: int = 1 async def _send_notification(self, emitter: Callable, type: str, content: str): await emitter( @@ -154,8 +239,17 @@ class Action: top_heading = title has_h1 = bool(re.search(r"^#\s+.+$", message_content, re.MULTILINE)) + + # Extract sources if available (for citations) + sources = ( + last_assistant_message.get("sources") or body.get("sources") or [] + ) + doc = self.markdown_to_docx( - message_content, top_heading=top_heading, has_h1=has_h1 + message_content, + top_heading=top_heading, + has_h1=has_h1, + sources=sources, ) # 保存到内存 @@ -165,6 +259,13 @@ class Action: file_content = doc_buffer.read() base64_blob = base64.b64encode(file_content).decode("utf-8") + # Escape message_content for JavaScript template literal + escaped_content = ( + message_content.replace("\\", "\\\\") # Escape backslashes first + .replace("`", "\\`") # Escape backticks + .replace("${", "\\${") # Escape template literal expressions + ) + # 触发文件下载 if __event_call__: await __event_call__( @@ -172,28 +273,197 @@ class Action: "type": "execute", "data": { "code": f""" - try {{ - const base64Data = "{base64_blob}"; - const binaryData = atob(base64Data); - const arrayBuffer = new Uint8Array(binaryData.length); - for (let i = 0; i < binaryData.length; i++) {{ - arrayBuffer[i] = binaryData.charCodeAt(i); - }} - const blob = new Blob([arrayBuffer], {{ type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document" }}); - const filename = "{filename}"; + (async function() {{ + try {{ + // Parse document.xml 
to find placeholders and extract optimized code + // We do this FIRST to get the actual code to render (which might have been optimized in Python) + + // Load JSZip + if (!window.JSZip) {{ + await new Promise((resolve, reject) => {{ + const script = document.createElement("script"); + script.src = "{self.valves.MERMAID_JSZIP_URL}"; + script.onload = resolve; + script.onerror = reject; + document.head.appendChild(script); + }}); + }} - const url = URL.createObjectURL(blob); - const a = document.createElement("a"); - a.style.display = "none"; - a.href = url; - a.download = filename; - document.body.appendChild(a); - a.click(); - URL.revokeObjectURL(url); - document.body.removeChild(a); - }} catch (error) {{ - console.error('触发下载时出错:', error); - }} + const base64Data = "{base64_blob}"; + const binaryData = atob(base64Data); + const arrayBuffer = new Uint8Array(binaryData.length); + for (let i = 0; i < binaryData.length; i++) {{ + arrayBuffer[i] = binaryData.charCodeAt(i); + }} + + const zip = new JSZip(); + await zip.loadAsync(arrayBuffer); + + // Parse document.xml + const docXml = await zip.file("word/document.xml").async("string"); + const parser = new DOMParser(); + const xmlDoc = parser.parseFromString(docXml, "application/xml"); + + const drawings = xmlDoc.getElementsByTagName("w:drawing"); + const placeholderInfo = []; + + for (let i = 0; i < drawings.length; i++) {{ + const drawing = drawings[i]; + const docPr = drawing.getElementsByTagName("wp:docPr")[0]; + if (docPr) {{ + const descr = docPr.getAttribute("descr"); + if (descr && descr.startsWith("MERMAID_SRC:")) {{ + const encodedCode = descr.substring("MERMAID_SRC:".length); + const code = decodeURIComponent(encodedCode); + + // Find the blip and extent to replace + const parent = drawing.parentNode; // w:r usually, or w:drawing parent + // We need to find a:blip and wp:extent within this drawing + const blip = drawing.getElementsByTagName("a:blip")[0]; + const extent = 
drawing.getElementsByTagName("wp:extent")[0]; + + if (blip && extent) {{ + const rId = blip.getAttribute("r:embed"); + placeholderInfo.push({{ rId, extent, code }}); + }} + }} + }} + }} + + if (placeholderInfo.length === 0) {{ + console.log("No Mermaid placeholders found in DOCX."); + // Just download the file as is + const blob = new Blob([arrayBuffer], {{type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document"}}); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.style.display = "none"; + a.href = url; + a.download = "{filename}"; + document.body.appendChild(a); + a.click(); + URL.revokeObjectURL(url); + document.body.removeChild(a); + return; + }} + + console.log(`Found ${{placeholderInfo.length}} Mermaid placeholders.`); + + // Load Mermaid + if (!window.mermaid) {{ + await new Promise((resolve, reject) => {{ + const script = document.createElement("script"); + script.src = "{self.valves.MERMAID_JS_URL}"; + script.onload = resolve; + script.onerror = reject; + document.head.appendChild(script); + }}); + }} + + mermaid.initialize({{ + startOnLoad: false, + theme: 'default', + }}); + + // Read rels XML once + const relsXml = await zip.file("word/_rels/document.xml.rels").async("string"); + const relsDoc = parser.parseFromString(relsXml, "application/xml"); + const relationships = relsDoc.getElementsByTagName("Relationship"); + const rIdToPath = {{}}; + + for (let i = 0; i < relationships.length; i++) {{ + const rel = relationships[i]; + rIdToPath[rel.getAttribute("Id")] = rel.getAttribute("Target"); + }} + + // Render and replace + console.log(`Processing ${{placeholderInfo.length}} diagrams...`); + + for (let i = 0; i < placeholderInfo.length; i++) {{ + const {{ rId, extent, code }} = placeholderInfo[i]; + const imagePath = "word/" + rIdToPath[rId]; + + console.log(`Block ${{i + 1}}/${{placeholderInfo.length}}: Rendering and replacing at ${{imagePath}}`); + + // Render SVG + const id = "mermaid-export-" 
+ i; + const {{ svg }} = await mermaid.render(id, code); + + // Convert SVG to PNG + const canvas = document.createElement("canvas"); + const ctx = canvas.getContext("2d"); + const img = new Image(); + + // Get SVG dimensions + const svgMatch = svg.match(/viewBox="[^"]*\s+[^"]*\s+([^"\s]+)\s+([^"\s]+)"/); + let width = 800; + let height = 600; + if (svgMatch) {{ + width = parseFloat(svgMatch[1]); + height = parseFloat(svgMatch[2]); + }} + + // Scale up for better quality + const scale = {self.valves.MERMAID_PNG_SCALE}; + canvas.width = width * scale; + canvas.height = height * scale; + + await new Promise((resolve, reject) => {{ + img.onload = resolve; + img.onerror = reject; + img.src = "data:image/svg+xml;base64," + btoa(unescape(encodeURIComponent(svg))); + }}); + + ctx.scale(scale, scale); + ctx.drawImage(img, 0, 0, width, height); + + const pngDataUrl = canvas.toDataURL("image/png"); + const pngBase64 = pngDataUrl.split(",")[1]; + + // Replace image in ZIP + zip.file(imagePath, pngBase64, {{base64: true}}); + + // Update dimensions in document.xml (EMUs) + // 1 inch = 914400 EMUs, 1 pixel ≈ 9525 EMUs at 96 DPI + // Max width: ~6 inches (page width minus margins) + const maxWidthEmu = 5486400; // 6 inches + const displayScale = {self.valves.MERMAID_DISPLAY_SCALE}; + let emuWidth = Math.round(width * 9525 * displayScale); + let emuHeight = Math.round(height * 9525 * displayScale); + + // Scale down if too wide + if (emuWidth > maxWidthEmu) {{ + const scaleFactor = maxWidthEmu / emuWidth; + emuWidth = maxWidthEmu; + emuHeight = Math.round(emuHeight * scaleFactor); + }} + + extent.setAttribute("cx", emuWidth); + extent.setAttribute("cy", emuHeight); + }} + + // Serialize updated XML + const serializer = new XMLSerializer(); + const newDocXml = serializer.serializeToString(xmlDoc); + zip.file("word/document.xml", newDocXml); + + // Generate final blob + const finalBlob = await zip.generateAsync({{type: "blob"}}); + const filename = "{filename}"; + + const url = 
URL.createObjectURL(finalBlob); + const a = document.createElement("a"); + a.style.display = "none"; + a.href = url; + a.download = filename; + document.body.appendChild(a); + a.click(); + URL.revokeObjectURL(url); + document.body.removeChild(a); + }} catch (error) {{ + console.error('Export failed:', error); + alert('导出失败: ' + error.message); + }} + }})(); """ }, } @@ -334,18 +604,362 @@ class Action: """清理文件名中的非法字符""" return re.sub(r'[\\/*?:"<>|]', "", name).strip()[:50] + def _strip_reasoning_blocks(self, text: str) -> str: + """ + Strip model reasoning blocks from assistant Markdown before export. + """ + if not text: + return text + + cur = text + for _ in range(10): + prev = cur + cur = _REASONING_DETAILS_RE.sub("", cur) + cur = _THINK_RE.sub("", cur) + cur = _ANALYSIS_RE.sub("", cur) + if cur == prev: + break + + # Clean up excessive blank lines left by removals. + cur = re.sub(r"\n{4,}", "\n\n\n", cur) + return cur + + def _add_display_equation(self, doc: Document, latex: str): + latex = (latex or "").strip() + if not latex: + return + + if not LATEX_MATH_AVAILABLE: + self.add_code_block(doc, latex, "latex") + return + + try: + mathml = latex_to_mathml(latex) + omml = mathml2omml.convert(mathml) + para = doc.add_paragraph() + para.alignment = WD_ALIGN_PARAGRAPH.CENTER + cast(Any, para)._p.append(self._wrap_omml_for_word(omml)) + except Exception as exc: + logger.warning(f"Math conversion failed; falling back to text: {exc}") + self.add_code_block(doc, latex, "latex") + + def _wrap_omml_for_word(self, omml: str): + m_ns = "http://schemas.openxmlformats.org/officeDocument/2006/math" + w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + # Keep the OMML payload as-is, but ensure it has the math namespace declared. 
+ xml = f'{omml}' + return parse_xml(xml) + + def _add_inline_equation( + self, + paragraph, + latex: str, + bold: bool = False, + italic: bool = False, + strike: bool = False, + ): + latex = (latex or "").strip() + if not latex: + return + + if not LATEX_MATH_AVAILABLE: + self._add_text_run( + paragraph, f"\\({latex}\\)", bold=bold, italic=italic, strike=strike + ) + return + + try: + mathml = latex_to_mathml(latex) + omml = mathml2omml.convert(mathml) + o_math = self._omml_oMath_element(omml) + run = paragraph.add_run() + run.bold = bold + run.italic = italic + run.font.strike = strike + cast(Any, run)._r.append(o_math) + except Exception as exc: + logger.warning(f"Inline math conversion failed; keeping literal: {exc}") + self._add_text_run( + paragraph, f"\\({latex}\\)", bold=bold, italic=italic, strike=strike + ) + + def _omml_oMath_element(self, omml: str): + # Ensure the OMML element declares the math namespace so parse_xml works. + m_ns = "http://schemas.openxmlformats.org/officeDocument/2006/math" + s = (omml or "").strip() + if s.startswith("") and s.endswith(""): + inner = s[len("") : -len("")] + s = f'{inner}' + elif s.startswith("", 1)[0]: + s = s.replace(" List[_CitationRef]: + citation_idx_map: Dict[str, int] = {} + refs_by_idx: Dict[int, _CitationRef] = {} + + for source in sources or []: + if not isinstance(source, dict): + continue + + documents = source.get("document") or [] + metadatas = source.get("metadata") or [] + src_info = source.get("source") or {} + + src_name = src_info.get("name") if isinstance(src_info, dict) else None + src_id_default = src_info.get("id") if isinstance(src_info, dict) else None + src_urls = src_info.get("urls") if isinstance(src_info, dict) else None + + if not isinstance(documents, list): + documents = [] + if not isinstance(metadatas, list): + metadatas = [] + + for idx_doc, _doc_text in enumerate(documents): + meta = metadatas[idx_doc] if idx_doc < len(metadatas) else {} + if not isinstance(meta, dict): + meta = {} 
+ + source_id = meta.get("source") or src_id_default or "N/A" + source_id_str = str(source_id) + + if source_id_str not in citation_idx_map: + citation_idx_map[source_id_str] = len(citation_idx_map) + 1 + idx = citation_idx_map[source_id_str] + + if idx in refs_by_idx: + continue + + url: Optional[str] = None + if isinstance(source_id, str) and re.match(r"^https?://", source_id): + url = source_id + elif isinstance(meta.get("url"), str) and re.match( + r"^https?://", meta["url"] + ): + url = meta["url"] + elif isinstance(src_urls, list) and src_urls: + if isinstance(src_urls[0], str) and re.match( + r"^https?://", src_urls[0] + ): + url = src_urls[0] + + title = ( + (meta.get("title") if isinstance(meta.get("title"), str) else None) + or (meta.get("name") if isinstance(meta.get("name"), str) else None) + or ( + src_name + if isinstance(src_name, str) and src_name.strip() + else None + ) + or (url if url else None) + or source_id_str + ) + + anchor = f"OWUIRef{idx}" + refs_by_idx[idx] = _CitationRef( + idx=idx, + anchor=anchor, + title=title, + url=url, + source_id=source_id_str, + ) + + return [refs_by_idx[i] for i in sorted(refs_by_idx.keys())] + + def _add_bookmark(self, paragraph, name: str): + bookmark_id = self._bookmark_id_counter + self._bookmark_id_counter += 1 + + start = OxmlElement("w:bookmarkStart") + start.set(qn("w:id"), str(bookmark_id)) + start.set(qn("w:name"), name) + + end = OxmlElement("w:bookmarkEnd") + end.set(qn("w:id"), str(bookmark_id)) + + p = cast(Any, paragraph)._p + p.insert(0, start) + p.append(end) + + def _add_internal_hyperlink(self, paragraph, display_text: str, anchor: str): + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("w:anchor"), anchor) + + new_run = OxmlElement("w:r") + rPr = OxmlElement("w:rPr") + rStyle = OxmlElement("w:rStyle") + rStyle.set(qn("w:val"), "Hyperlink") + rPr.append(rStyle) + + new_run.append(rPr) + t = OxmlElement("w:t") + t.text = display_text + new_run.append(t) + + hyperlink.append(new_run) + 
cast(Any, paragraph)._p.append(hyperlink) + + def _add_references_section(self, doc: Document): + self.add_heading(doc, "参考资料", 2) + + for ref in self._citation_refs: + para = doc.add_paragraph(style="List Number") + self._add_bookmark(para, ref.anchor) + # Include URL as an external link when available. + if ref.url: + self._add_hyperlink(para, ref.title, ref.url, display_text=ref.title) + else: + self._add_text_run( + para, ref.title, bold=False, italic=False, strike=False + ) + + def _normalize_url(self, url: str) -> str: + u = (url or "").strip() + if u.lower().startswith("www."): + u = "https://" + u + + # Trim common trailing punctuation that often follows URLs in prose. + while u and u[-1] in ".,;:!?)]}": + u = u[:-1] + return u + + def _add_hyperlink( + self, paragraph, text: str, url: str, display_text: Optional[str] = None + ): + u = self._normalize_url(url) + if not u: + paragraph.add_run(display_text or text) + return + + part = getattr(paragraph, "part", None) + if part is None or not hasattr(part, "relate_to"): + # Fallback if relationship API isn't available. 
+ run = paragraph.add_run(display_text or text) + run.font.color.rgb = RGBColor(0, 0, 255) + run.font.underline = True + return + + r_id = part.relate_to(u, RT.HYPERLINK, is_external=True) + + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("r:id"), r_id) + + new_run = OxmlElement("w:r") + rPr = OxmlElement("w:rPr") + rStyle = OxmlElement("w:rStyle") + rStyle.set(qn("w:val"), "Hyperlink") + rPr.append(rStyle) + + color = OxmlElement("w:color") + color.set(qn("w:val"), "0000FF") + rPr.append(color) + + u_el = OxmlElement("w:u") + u_el.set(qn("w:val"), "single") + rPr.append(u_el) + + new_run.append(rPr) + + t = OxmlElement("w:t") + t.text = display_text or text + new_run.append(t) + + hyperlink.append(new_run) + cast(Any, paragraph)._p.append(hyperlink) + + def _add_text_run(self, paragraph, s: str, bold: bool, italic: bool, strike: bool): + if not s: + return + run = paragraph.add_run(s) + if bold: + run.bold = True + if italic: + run.italic = True + if strike: + run.font.strike = True + + # Set Chinese font (copying from existing add_paragraph logic) + run.font.name = "Times New Roman" + run._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体") + + def _add_inline_code(self, paragraph, s: str): + if s == "": + return + + # Simple inline code without URL parsing for now, or copy full logic if needed. + # For now, just basic styling to match existing. 
+ run = paragraph.add_run(s) + run.font.name = "Consolas" + run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimHei") + run.font.size = Pt(10) + shading = OxmlElement("w:shd") + shading.set(qn("w:fill"), "E8E8E8") + run._element.rPr.append(shading) + + def _add_hyperlink_code(self, paragraph, display_text: str, url: str): + u = self._normalize_url(url) + if not u: + self._add_inline_code(paragraph, display_text) + return + + part = getattr(paragraph, "part", None) + if part is None or not hasattr(part, "relate_to"): + self._add_inline_code(paragraph, display_text) + return + + r_id = part.relate_to(u, RT.HYPERLINK, is_external=True) + + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("r:id"), r_id) + + new_run = OxmlElement("w:r") + rPr = OxmlElement("w:rPr") + + rFonts = OxmlElement("w:rFonts") + rFonts.set(qn("w:ascii"), "Consolas") + rFonts.set(qn("w:hAnsi"), "Consolas") + rFonts.set(qn("w:eastAsia"), "SimHei") + rPr.append(rFonts) + + sz = OxmlElement("w:sz") + sz.set(qn("w:val"), "20") # 10pt + rPr.append(sz) + + shading = OxmlElement("w:shd") + shading.set(qn("w:fill"), "E8E8E8") + rPr.append(shading) + + new_run.append(rPr) + + t = OxmlElement("w:t") + t.text = display_text + new_run.append(t) + + hyperlink.append(new_run) + cast(Any, paragraph)._p.append(hyperlink) + def markdown_to_docx( - self, markdown_text: str, top_heading: str = "", has_h1: bool = False + self, + markdown_text: str, + top_heading: str = "", + has_h1: bool = False, + sources: Optional[List[dict]] = None, ) -> Document: """ 将 Markdown 文本转换为 Word 文档 - 支持:标题、段落、粗体、斜体、代码块、列表、表格、链接 + 支持:标题、段落、粗体、斜体、代码块、列表、表格、链接、原生数学公式、引用和移除思考过程。 """ doc = Document() # 设置默认中文字体 self.set_document_default_font(doc) + # 构建引用 + self._citation_refs = self._build_citation_refs(sources) + + # 移除思考过程 + markdown_text = self._strip_reasoning_blocks(markdown_text) + # 若正文无一级标题且有对话标题,则作为一级标题写入 if top_heading and not has_h1: self.add_heading(doc, top_heading, 1) @@ -372,14 +986,44 @@ class Action: in_list = False 
in_code_block = True - code_block_lang = line.strip()[3:].strip() + code_block_info_raw = line.strip()[3:].strip() + code_block_lang, code_block_attrs = self._parse_fence_info( + code_block_info_raw + ) code_block_content = [] else: # 代码块结束 in_code_block = False - self.add_code_block( - doc, "\n".join(code_block_content), code_block_lang - ) + code_text = "\n".join(code_block_content) + + # 检查是否为 Mermaid 或 Flowchart + mermaid_langs = { + "mermaid", + "flowchart", + "sequence", + "gantt", + "class", + "state", + "pie", + "er", + "journey", + "gitgraph", + "mindmap", + } + + if code_block_lang.lower() in mermaid_langs: + # 创建 Mermaid 块对象 + block = _MermaidFenceBlock( + info_raw=code_block_info_raw, + language=code_block_lang, + attrs=code_block_attrs, + source=code_text, + ) + # 插入占位符 + self._insert_mermaid_placeholder(doc, block) + else: + self.add_code_block(doc, code_text, code_block_lang) + code_block_content = [] code_block_lang = "" i += 1 @@ -390,6 +1034,66 @@ class Action: i += 1 continue + # 处理数学块: $$...$$ 或 \[...\] + # 简单检测: 如果行以 $$ 或 \[ 开头,则视为数学块开始 + stripped_line = line.strip() + if stripped_line.startswith("$$") or stripped_line.startswith("\\["): + # 先处理之前积累的列表 + if in_list and list_items: + self.add_list_to_doc(doc, list_items, list_type) + list_items = [] + in_list = False + + # 检查是否为单行块,如 $$ E=mc^2 $$ + if ( + stripped_line.startswith("$$") + and stripped_line.endswith("$$") + and len(stripped_line) > 4 + ) or ( + stripped_line.startswith("\\[") + and stripped_line.endswith("\\]") + and len(stripped_line) > 4 + ): + # 提取内容 + if stripped_line.startswith("$$"): + math_content = stripped_line[2:-2] + else: + math_content = stripped_line[2:-2] + self._add_display_equation(doc, math_content) + i += 1 + continue + + # 多行数学块 + math_lines = [] + # 移除开头标记 + if stripped_line.startswith("$$"): + current_line_content = stripped_line[2:] + end_marker = "$$" + else: + current_line_content = stripped_line[2:] + end_marker = "\\]" + + if 
current_line_content.strip(): + math_lines.append(current_line_content) + + i += 1 + block_closed = False + while i < len(lines): + next_line = lines[i] + if next_line.strip().endswith(end_marker): + # 找到结束标记 + content_before_end = next_line.strip()[: -len(end_marker)] + if content_before_end.strip(): + math_lines.append(content_before_end) + block_closed = True + i += 1 + break + math_lines.append(next_line) + i += 1 + + self._add_display_equation(doc, "\n".join(math_lines)) + continue + # 处理表格 if line.strip().startswith("|") and line.strip().endswith("|"): # 先处理之前积累的列表 @@ -501,6 +1205,10 @@ class Action: if in_list and list_items: self.add_list_to_doc(doc, list_items, list_type) + # 添加参考资料章节 + if self._citation_refs: + self._add_references_section(doc) + return doc def set_document_default_font(self, doc: Document): @@ -547,10 +1255,16 @@ class Action: def add_formatted_text(self, paragraph, text: str): """ 解析 Markdown 内联格式并添加到段落 - 支持:粗体、斜体、行内代码、链接、删除线 + 支持:粗体、斜体、行内代码、链接、删除线、行内公式、引用 """ # 定义格式化模式 patterns = [ + # Inline Math \( ... \) + (r"\\\((.+?)\\\)", {"math": True}), + # Inline Math $...$ (single dollar signs, non-greedy) + (r"(? 
pos: plain_text = text[pos : match["start"]] if plain_text: - paragraph.add_run(plain_text) + self._add_text_run(paragraph, plain_text, False, False, False) # 添加格式化文本 style = match["style"] run_text = match["text"] - if style.get("link"): - # 链接处理 - run = paragraph.add_run(run_text) - run.font.color.rgb = RGBColor(0, 0, 255) - run.font.underline = True + if style.get("math"): + self._add_inline_equation(paragraph, run_text) + elif style.get("citation"): + idx = int(run_text) + # Find the anchor for this index + ref = next((r for r in self._citation_refs if r.idx == idx), None) + if ref: + self._add_internal_hyperlink(paragraph, f"[{idx}]", ref.anchor) + else: + self._add_text_run(paragraph, f"[{idx}]", False, False, False) + elif style.get("link"): + # 处理链接 + self._add_hyperlink(paragraph, run_text, match["url"]) elif style.get("code"): # 行内代码 - run = paragraph.add_run(run_text) - run.font.name = "Consolas" - run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimHei") - run.font.size = Pt(10) - # 添加背景色 - shading = OxmlElement("w:shd") - shading.set(qn("w:fill"), "E8E8E8") - run._element.rPr.append(shading) + self._add_inline_code(paragraph, run_text) else: - run = paragraph.add_run(run_text) - if style.get("bold"): - run.bold = True - if style.get("italic"): - run.italic = True - if style.get("strike"): - run.font.strike = True + # For bold/italic/strike, check if the text contains inline math + # Pattern for inline math: \(...\) or $...$ + math_pattern = r"(\\\((.+?)\\\)|\$([^$]+?)\$)" + math_matches = list(re.finditer(math_pattern, run_text)) + + if math_matches: + # Process text with inline math + text_pos = 0 + for math_match in math_matches: + # Add text before math + if math_match.start() > text_pos: + before_text = run_text[text_pos : math_match.start()] + self._add_text_run( + paragraph, + before_text, + bold=style.get("bold", False), + italic=style.get("italic", False), + strike=style.get("strike", False), + ) + # Add inline equation with formatting + 
latex_content = math_match.group(2) or math_match.group(3) + self._add_inline_equation( + paragraph, + latex_content, + bold=style.get("bold", False), + italic=style.get("italic", False), + strike=style.get("strike", False), + ) + text_pos = math_match.end() + # Add remaining text after last math + if text_pos < len(run_text): + self._add_text_run( + paragraph, + run_text[text_pos:], + bold=style.get("bold", False), + italic=style.get("italic", False), + strike=style.get("strike", False), + ) + else: + self._add_text_run( + paragraph, + run_text, + bold=style.get("bold", False), + italic=style.get("italic", False), + strike=style.get("strike", False), + ) pos = match["end"] - # 添加剩余的普通文本 if pos < len(text): - paragraph.add_run(text[pos:]) + self._add_text_run(paragraph, text[pos:], False, False, False) def add_code_block(self, doc: Document, code: str, language: str = ""): """添加代码块,支持语法高亮""" @@ -735,75 +1496,258 @@ class Action: run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimHei") run.font.size = Pt(10) + def _insert_mermaid_placeholder( + self, doc: Document, block: Union[_MermaidFenceBlock, str] + ): + self._mermaid_figure_counter += 1 + if isinstance(block, str): + code = block + else: + code = block.source + + # 为每个占位符创建唯一的透明 PNG + # 通过改变图片尺寸来确保 python-docx 不会重用同一个图片文件 + # 使用 figure_counter 来创建不同尺寸(1x1, 1x2, 1x3, ...) 
+ from PIL import Image + + # 创建一个透明图片,尺寸为 1 x counter(确保每个都不同) + img = Image.new("RGBA", (1, self._mermaid_figure_counter), (0, 0, 0, 0)) + image_stream = io.BytesIO() + img.save(image_stream, format="PNG") + image_stream.seek(0) + + # 添加居中段落 + paragraph = doc.add_paragraph() + paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER + run = paragraph.add_run() + + # 添加图片 (默认大小,稍后 JS 会调整) + picture = run.add_picture(image_stream, width=Inches(1)) + + # 设置 Alt Text (描述) 为 "MERMAID_SRC:" + # 这是 Python 和 JS 之间的魔法链接 + import urllib.parse + + encoded_code = urllib.parse.quote(code) + + # 访问底层 XML 设置 docPr descr + inline = picture._inline + docPr = inline.docPr + + # 使用 .set() 确保属性写入 XML + docPr.set("descr", f"MERMAID_SRC:{encoded_code}") + docPr.set("title", "Mermaid Diagram Placeholder") + + # 添加标题 + if self.valves.MERMAID_CAPTIONS_ENABLE: + self._add_mermaid_caption(doc, self._mermaid_figure_counter) + + def _add_mermaid_caption(self, doc: Document, figure_number: int): + if not self._caption_style_name: + self._ensure_caption_style(doc) + self._caption_style_name = self.valves.MERMAID_CAPTION_STYLE + + caption_text = f"{self.valves.MERMAID_CAPTION_PREFIX} {figure_number}" + paragraph = doc.add_paragraph(caption_text, style=self._caption_style_name) + paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER + paragraph.paragraph_format.keep_with_next = False + + def _ensure_caption_style(self, doc: Document): + style_name = self.valves.MERMAID_CAPTION_STYLE + styles = doc.styles + if style_name not in styles: + style = styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH) + style.base_style = styles["Normal"] + style.next_paragraph_style = styles["Normal"] + font = style.font + font.name = "Times New Roman" + font.size = Pt(10) + font.italic = True + font.color.rgb = RGBColor(0x55, 0x55, 0x55) # 深灰色 + + def _parse_fence_info(self, info_raw: str) -> Tuple[str, List[str]]: + parts = info_raw.split() + if not parts: + return "", [] + lang = parts[0] + attrs = parts[1:] + return lang, 
attrs + def add_table(self, doc: Document, table_lines: List[str]): - """添加表格,支持表头底色与隔行底色""" + """添加 Markdown 表格,支持智能列宽、对齐和单元格内格式化""" if len(table_lines) < 2: return + header_fill = "F2F2F2" + zebra_fill = "FBFBFB" + + def _split_row(line: str) -> List[str]: + raw = line.strip().strip("|") + return [c.strip() for c in raw.split("|")] + + def _is_separator_row(cells: List[str]) -> bool: + if not cells: + return False + ok = 0 + for c in cells: + c = c.strip() + if re.fullmatch(r":?-{3,}:?", c): + ok += 1 + return ok == len(cells) + + def _col_align(cell: str) -> WD_ALIGN_PARAGRAPH: + s = (cell or "").strip() + if s.startswith(":") and s.endswith(":"): + return WD_ALIGN_PARAGRAPH.CENTER + if s.endswith(":"): + return WD_ALIGN_PARAGRAPH.RIGHT + return WD_ALIGN_PARAGRAPH.LEFT + def _set_cell_shading(cell, fill: str): tc_pr = cell._element.get_or_add_tcPr() shd = OxmlElement("w:shd") shd.set(qn("w:fill"), fill) tc_pr.append(shd) - header_fill = "F2F2F2" - zebra_fill = "FBFBFB" - - # 解析表格数据 - rows = [] - for line in table_lines: - cells = [cell.strip() for cell in line.strip().strip("|").split("|")] - # 跳过分隔行 - if all(re.fullmatch(r"[-:]+", cell) for cell in cells): - continue - rows.append(cells) - - if not rows: + raw_rows = [_split_row(l) for l in table_lines if l.strip().startswith("|")] + if not raw_rows: return - # 确定列数 - num_cols = max(len(row) for row in rows) + sep_idx = 1 if len(raw_rows) > 1 and _is_separator_row(raw_rows[1]) else -1 + header = raw_rows[0] + body = raw_rows[sep_idx + 1 :] if sep_idx >= 0 else raw_rows[1:] - # 创建表格 - table = doc.add_table(rows=len(rows), cols=num_cols) + num_cols = max(len(header), *(len(r) for r in body)) if body else len(header) + header = header + [""] * (num_cols - len(header)) + body = [r + [""] * (num_cols - len(r)) for r in body] + + aligns = [ + _col_align(c) for c in (raw_rows[1] if sep_idx == 1 else [""] * num_cols) + ] + + table = doc.add_table(rows=1 + len(body), cols=num_cols) table.style = "Table Grid" - 
table.alignment = WD_TABLE_ALIGNMENT.CENTER + table.alignment = WD_TABLE_ALIGNMENT.LEFT + cast(Any, table).autofit = False - # 填充表格 - for row_idx, row_data in enumerate(rows): - row = table.rows[row_idx] - for col_idx, cell_text in enumerate(row_data): - if col_idx < num_cols: - cell = row.cells[col_idx] - # 清除默认段落 - cell.paragraphs[0].clear() - para = cell.paragraphs[0] - para.paragraph_format.space_after = Pt(3) - para.paragraph_format.space_before = Pt(1) - para.alignment = WD_ALIGN_PARAGRAPH.LEFT + # 单元格边距 + self._set_table_cell_margins(table, top=60, bottom=60, left=90, right=90) - self.add_formatted_text(para, cell_text) + # 列宽按内容比例分配 + available_width = int(self._available_block_width(doc)) + min_col = max(int(Inches(0.55)), available_width // max(1, num_cols * 3)) - # 设置单元格字体 - for run in para.runs: - run.font.name = "Times New Roman" - run._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体") - run.font.size = Pt(10) + def _plain_len(s: str) -> int: + t = re.sub(r"`([^`]+)`", r"\1", s or "") + t = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"\1", t) + t = re.sub(r"\s+", " ", t).strip() + return len(t) - # 表头加粗并填充底色 - if row_idx == 0: - for run in para.runs: - run.bold = True - _set_cell_shading(cell, header_fill) - # 隔行底色 - elif row_idx % 2 == 1: - _set_cell_shading(cell, zebra_fill) + weights: List[int] = [] + for ci in range(num_cols): + max_len = _plain_len(header[ci]) + for r in body: + max_len = max(max_len, _plain_len(r[ci])) + weights.append(max(1, min(max_len, 40))) - # 统一列对齐为左对齐,避免居中导致阅读困难 - for row in table.rows: - for cell in row.cells: - for para in cell.paragraphs: - para.alignment = WD_ALIGN_PARAGRAPH.LEFT + sum_w = sum(weights) or 1 + widths = [max(min_col, int(available_width * w / sum_w)) for w in weights] + total = sum(widths) + if total > available_width: + even = max(1, available_width // max(1, num_cols)) + widths = [even] * num_cols + total = sum(widths) + if total < available_width: + rem = available_width - total + order = sorted(range(num_cols), 
key=lambda i: weights[i], reverse=True) + oi = 0 + while rem > 0 and order: + widths[order[oi % len(order)]] += 1 + rem -= 1 + oi += 1 + + for ci, w in enumerate(widths): + table.columns[ci].width = w + for row in table.rows: + row.cells[ci].width = w + + def _format_cell_paragraph(para, align: WD_ALIGN_PARAGRAPH): + para.alignment = align + pf = para.paragraph_format + pf.space_before = Pt(0) + pf.space_after = Pt(0) + pf.line_spacing_rule = WD_LINE_SPACING.SINGLE + + def _fill_cell(cell, text: str, align: WD_ALIGN_PARAGRAPH, bold: bool = False): + cell.text = "" + parts = [ + p for p in re.split(r"(?:|\\n)", text or "") if p is not None + ] + if not parts: + parts = [""] + for pi, part in enumerate(parts): + para = cell.paragraphs[0] if pi == 0 else cell.add_paragraph() + _format_cell_paragraph(para, align) + self.add_formatted_text(para, part) + for run in para.runs: + run.font.size = Pt(9) + run.font.name = "Times New Roman" + run._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体") + if bold: + run.bold = True + + # 表头行 + header_row = table.rows[0] + self._set_table_header_row_repeat(header_row) + for ci in range(num_cols): + cell = header_row.cells[ci] + _set_cell_shading(cell, header_fill) + _fill_cell( + cell, + header[ci], + aligns[ci] if ci < len(aligns) else WD_ALIGN_PARAGRAPH.LEFT, + bold=True, + ) + + # 数据行 + for ri, row_data in enumerate(body, start=1): + row = table.rows[ri] + for ci in range(num_cols): + cell = row.cells[ci] + if (ri % 2) == 0: + _set_cell_shading(cell, zebra_fill) + _fill_cell( + cell, + row_data[ci], + aligns[ci] if ci < len(aligns) else WD_ALIGN_PARAGRAPH.LEFT, + ) + + def _available_block_width(self, doc: Document): + section = doc.sections[0] + return section.page_width - section.left_margin - section.right_margin + + def _set_table_cell_margins( + self, table, top: int, bottom: int, left: int, right: int + ): + tbl_pr = cast(Any, table)._tbl.tblPr + tbl_cell_mar = OxmlElement("w:tblCellMar") + for tag, val in ( + ("top", top), + 
("bottom", bottom), + ("left", left), + ("right", right), + ): + el = OxmlElement(f"w:{tag}") + el.set(qn("w:w"), str(int(val))) + el.set(qn("w:type"), "dxa") + tbl_cell_mar.append(el) + tbl_pr.append(tbl_cell_mar) + + def _set_table_header_row_repeat(self, row): + tr_pr = row._tr.get_or_add_trPr() + tbl_header = OxmlElement("w:tblHeader") + tbl_header.set(qn("w:val"), "true") + tr_pr.append(tbl_header) def add_list_to_doc( self, doc: Document, items: List[Tuple[int, str]], list_type: str