Add export to Word functionality with Markdown support

- Implemented a new plugin to export current conversation content from Markdown to Word (.docx) format. - Added README_CN.md for Chinese documentation, detailing features, supported Markdown syntax, usage instructions, font configurations, and author information. - Developed export_to_word.py to handle the conversion process, including title extraction, document creation, and file download triggering. - Included support for various Markdown elements such as headings, lists, tables, and inline formatting. - Ensured proper handling of Chinese and English text without encoding issues.
2025-12-30 13:29:44 +08:00
parent b0fad966ce
commit 278a6ce12c
4 changed files with 1244 additions and 0 deletions
--- a/plugins/actions/export_to_docx/导出为Word.py
+++ b/plugins/actions/export_to_docx/导出为Word.py
@@ -0,0 +1,574 @@
+"""
+title: 导出为 Word
+author: Fu-Jie
+author_url: https://github.com/Fu-Jie
+funding_url: https://github.com/Fu-Jie/awesome-openwebui
+version: 0.1.0
+icon_url: data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9ImN1cnJlbnRDb2xvciIgc3Ryb2tlLXdpZHRoPSIyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiPjxwYXRoIGQ9Ik0xNCAySDZhMiAyIDAgMCAwLTIgMnYxNmEyIDIgMCAwIDAgMiAyaDEyYTIgMiAwIDAgMCAyLTJWOFoiLz48cGF0aCBkPSJNMTQgMnY2aDYiLz48cGF0aCBkPSJNMTYgMTNoLTIuNWEyIDIgMCAwIDAgMCA0SDEyIi8+PHBhdGggZD0iTTggMTNoMiIvPjxwYXRoIGQ9Ik04IDE3aDIiLz48L3N2Zz4=
+requirements: python-docx==1.1.2
+description: 将当前对话内容从 Markdown 转换并导出为 Word (.docx) 文件，支持中英文无乱码。
+"""
+
+import os
+import re
+import base64
+import datetime
+import io
+from typing import Optional, Callable, Awaitable, Any, List, Tuple
+from docx import Document
+from docx.shared import Pt, Inches, RGBColor, Cm
+from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_LINE_SPACING
+from docx.enum.table import WD_TABLE_ALIGNMENT
+from docx.enum.style import WD_STYLE_TYPE
+from docx.oxml.ns import qn
+from docx.oxml import OxmlElement
+
+
+class Action:
+    def __init__(self):
+        pass
+
+    async def _send_notification(self, emitter: Callable, type: str, content: str):
+        await emitter(
+            {"type": "notification", "data": {"type": type, "content": content}}
+        )
+
+    async def action(
+        self,
+        body: dict,
+        __user__=None,
+        __event_emitter__=None,
+        __event_call__: Optional[Callable[[Any], Awaitable[None]]] = None,
+    ):
+        print(f"action:{__name__}")
+
+        # 解析用户信息
+        if isinstance(__user__, (list, tuple)):
+            user_language = (
+                __user__[0].get("language", "zh-CN") if __user__ else "zh-CN"
+            )
+            user_name = __user__[0].get("name", "用户") if __user__[0] else "用户"
+            user_id = (
+                __user__[0]["id"]
+                if __user__ and "id" in __user__[0]
+                else "unknown_user"
+            )
+        elif isinstance(__user__, dict):
+            user_language = __user__.get("language", "zh-CN")
+            user_name = __user__.get("name", "用户")
+            user_id = __user__.get("id", "unknown_user")
+
+        if __event_emitter__:
+            last_assistant_message = body["messages"][-1]
+
+            await __event_emitter__(
+                {
+                    "type": "status",
+                    "data": {"description": "正在转换为 Word 文档...", "done": False},
+                }
+            )
+
+            try:
+                message_content = last_assistant_message["content"]
+
+                if not message_content or not message_content.strip():
+                    await self._send_notification(
+                        __event_emitter__, "error", "没有找到可导出的内容！"
+                    )
+                    return
+
+                # 生成文件名
+                title = self.extract_title(message_content)
+                current_datetime = datetime.datetime.now()
+                formatted_date = current_datetime.strftime("%Y%m%d")
+
+                if title:
+                    filename = f"{self.clean_filename(title)}.docx"
+                else:
+                    filename = f"{user_name}_{formatted_date}.docx"
+
+                # 创建 Word 文档
+                doc = self.markdown_to_docx(message_content)
+
+                # 保存到内存
+                doc_buffer = io.BytesIO()
+                doc.save(doc_buffer)
+                doc_buffer.seek(0)
+                file_content = doc_buffer.read()
+                base64_blob = base64.b64encode(file_content).decode("utf-8")
+
+                # 触发文件下载
+                if __event_call__:
+                    await __event_call__(
+                        {
+                            "type": "execute",
+                            "data": {
+                                "code": f"""
+                                try {{
+                                    const base64Data = "{base64_blob}";
+                                    const binaryData = atob(base64Data);
+                                    const arrayBuffer = new Uint8Array(binaryData.length);
+                                    for (let i = 0; i < binaryData.length; i++) {{
+                                        arrayBuffer[i] = binaryData.charCodeAt(i);
+                                    }}
+                                    const blob = new Blob([arrayBuffer], {{ type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document" }});
+                                    const filename = "{filename}";
+
+                                    const url = URL.createObjectURL(blob);
+                                    const a = document.createElement("a");
+                                    a.style.display = "none";
+                                    a.href = url;
+                                    a.download = filename;
+                                    document.body.appendChild(a);
+                                    a.click();
+                                    URL.revokeObjectURL(url);
+                                    document.body.removeChild(a);
+                                }} catch (error) {{
+                                    console.error('触发下载时出错:', error);
+                                }}
+                                """
+                            },
+                        }
+                    )
+
+                await __event_emitter__(
+                    {
+                        "type": "status",
+                        "data": {"description": "Word 文档已导出", "done": True},
+                    }
+                )
+
+                await self._send_notification(
+                    __event_emitter__, "success", f"已成功导出为 {filename}"
+                )
+
+                return {"message": "下载事件已触发"}
+
+            except Exception as e:
+                print(f"Error exporting to Word: {str(e)}")
+                await __event_emitter__(
+                    {
+                        "type": "status",
+                        "data": {
+                            "description": f"导出失败: {str(e)}",
+                            "done": True,
+                        },
+                    }
+                )
+                await self._send_notification(
+                    __event_emitter__, "error", f"导出 Word 文档时出错: {str(e)}"
+                )
+
+    def extract_title(self, content: str) -> str:
+        """从 Markdown 内容中提取标题"""
+        lines = content.split("\n")
+        for line in lines:
+            # 匹配 h1-h3 标题
+            match = re.match(r"^#{1,3}\s+(.+)$", line.strip())
+            if match:
+                return match.group(1).strip()
+        return ""
+
+    def clean_filename(self, name: str) -> str:
+        """清理文件名中的非法字符"""
+        return re.sub(r'[\\/*?:"<>|]', "", name).strip()[:50]
+
+    def markdown_to_docx(self, markdown_text: str) -> Document:
+        """
+        将 Markdown 文本转换为 Word 文档
+        支持：标题、段落、粗体、斜体、代码块、列表、表格、链接
+        """
+        doc = Document()
+
+        # 设置默认中文字体
+        self.set_document_default_font(doc)
+
+        lines = markdown_text.split("\n")
+        i = 0
+        in_code_block = False
+        code_block_content = []
+        code_block_lang = ""
+        in_list = False
+        list_items = []
+        list_type = None  # 'ordered' or 'unordered'
+
+        while i < len(lines):
+            line = lines[i]
+
+            # 处理代码块
+            if line.strip().startswith("```"):
+                if not in_code_block:
+                    # 先处理之前积累的列表
+                    if in_list and list_items:
+                        self.add_list_to_doc(doc, list_items, list_type)
+                        list_items = []
+                        in_list = False
+
+                    in_code_block = True
+                    code_block_lang = line.strip()[3:].strip()
+                    code_block_content = []
+                else:
+                    # 代码块结束
+                    in_code_block = False
+                    self.add_code_block(
+                        doc, "\n".join(code_block_content), code_block_lang
+                    )
+                    code_block_content = []
+                    code_block_lang = ""
+                i += 1
+                continue
+
+            if in_code_block:
+                code_block_content.append(line)
+                i += 1
+                continue
+
+            # 处理表格
+            if line.strip().startswith("|") and line.strip().endswith("|"):
+                # 先处理之前积累的列表
+                if in_list and list_items:
+                    self.add_list_to_doc(doc, list_items, list_type)
+                    list_items = []
+                    in_list = False
+
+                table_lines = []
+                while i < len(lines) and lines[i].strip().startswith("|"):
+                    table_lines.append(lines[i])
+                    i += 1
+                self.add_table(doc, table_lines)
+                continue
+
+            # 处理标题
+            header_match = re.match(r"^(#{1,6})\s+(.+)$", line.strip())
+            if header_match:
+                # 先处理之前积累的列表
+                if in_list and list_items:
+                    self.add_list_to_doc(doc, list_items, list_type)
+                    list_items = []
+                    in_list = False
+
+                level = len(header_match.group(1))
+                text = header_match.group(2)
+                self.add_heading(doc, text, level)
+                i += 1
+                continue
+
+            # 处理无序列表
+            unordered_match = re.match(r"^(\s*)[-*+]\s+(.+)$", line)
+            if unordered_match:
+                if not in_list or list_type != "unordered":
+                    if in_list and list_items:
+                        self.add_list_to_doc(doc, list_items, list_type)
+                        list_items = []
+                    in_list = True
+                    list_type = "unordered"
+                indent = len(unordered_match.group(1)) // 2
+                list_items.append((indent, unordered_match.group(2)))
+                i += 1
+                continue
+
+            # 处理有序列表
+            ordered_match = re.match(r"^(\s*)\d+[.)]\s+(.+)$", line)
+            if ordered_match:
+                if not in_list or list_type != "ordered":
+                    if in_list and list_items:
+                        self.add_list_to_doc(doc, list_items, list_type)
+                        list_items = []
+                    in_list = True
+                    list_type = "ordered"
+                indent = len(ordered_match.group(1)) // 2
+                list_items.append((indent, ordered_match.group(2)))
+                i += 1
+                continue
+
+            # 处理水平分割线
+            if re.match(r"^[-*_]{3,}$", line.strip()):
+                # 先处理之前积累的列表
+                if in_list and list_items:
+                    self.add_list_to_doc(doc, list_items, list_type)
+                    list_items = []
+                    in_list = False
+
+                self.add_horizontal_rule(doc)
+                i += 1
+                continue
+
+            # 处理空行
+            if not line.strip():
+                # 列表结束
+                if in_list and list_items:
+                    self.add_list_to_doc(doc, list_items, list_type)
+                    list_items = []
+                    in_list = False
+                i += 1
+                continue
+
+            # 处理普通段落
+            if in_list and list_items:
+                self.add_list_to_doc(doc, list_items, list_type)
+                list_items = []
+                in_list = False
+
+            self.add_paragraph(doc, line)
+            i += 1
+
+        # 处理剩余的列表
+        if in_list and list_items:
+            self.add_list_to_doc(doc, list_items, list_type)
+
+        return doc
+
+    def set_document_default_font(self, doc: Document):
+        """设置文档默认字体，确保中英文都正常显示"""
+        # 设置正文样式
+        style = doc.styles["Normal"]
+        font = style.font
+        font.name = "Times New Roman"  # 英文字体
+        font.size = Pt(11)
+
+        # 设置中文字体
+        style._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
+
+        # 设置段落格式
+        paragraph_format = style.paragraph_format
+        paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+        paragraph_format.space_after = Pt(6)
+
+    def add_heading(self, doc: Document, text: str, level: int):
+        """添加标题"""
+        # Word 标题级别从 0 开始，Markdown 从 1 开始
+        heading_level = min(level, 9)  # Word 最多支持 Heading 9
+        heading = doc.add_heading(level=heading_level)
+
+        # 解析并添加格式化文本
+        self.add_formatted_text(heading, text)
+
+        # 设置中文字体
+        for run in heading.runs:
+            run.font.name = "Times New Roman"
+            run._element.rPr.rFonts.set(qn("w:eastAsia"), "黑体")
+
+    def add_paragraph(self, doc: Document, text: str):
+        """添加段落，支持内联格式"""
+        paragraph = doc.add_paragraph()
+        self.add_formatted_text(paragraph, text)
+
+        # 设置中文字体
+        for run in paragraph.runs:
+            run.font.name = "Times New Roman"
+            run._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
+
+    def add_formatted_text(self, paragraph, text: str):
+        """
+        解析 Markdown 内联格式并添加到段落
+        支持：粗体、斜体、行内代码、链接、删除线
+        """
+        # 定义格式化模式
+        patterns = [
+            # 粗斜体 ***text*** 或 ___text___
+            (r"\*\*\*(.+?)\*\*\*|___(.+?)___", {"bold": True, "italic": True}),
+            # 粗体 **text** 或 __text__
+            (r"\*\*(.+?)\*\*|__(.+?)__", {"bold": True}),
+            # 斜体 *text* 或 _text_
+            (
+                r"(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|(?<!_)_(?!_)(.+?)(?<!_)_(?!_)",
+                {"italic": True},
+            ),
+            # 行内代码 `code`
+            (r"`([^`]+)`", {"code": True}),
+            # 链接 [text](url)
+            (r"\[([^\]]+)\]\(([^)]+)\)", {"link": True}),
+            # 删除线 ~~text~~
+            (r"~~(.+?)~~", {"strike": True}),
+        ]
+
+        # 简化处理：逐段解析
+        remaining = text
+        last_end = 0
+
+        # 合并所有匹配项
+        all_matches = []
+
+        for pattern, style in patterns:
+            for match in re.finditer(pattern, text):
+                # 获取匹配的文本内容
+                groups = match.groups()
+                matched_text = next((g for g in groups if g is not None), "")
+                all_matches.append(
+                    {
+                        "start": match.start(),
+                        "end": match.end(),
+                        "text": matched_text,
+                        "style": style,
+                        "full_match": match.group(0),
+                        "url": (
+                            groups[1] if style.get("link") and len(groups) > 1 else None
+                        ),
+                    }
+                )
+
+        # 按位置排序
+        all_matches.sort(key=lambda x: x["start"])
+
+        # 移除重叠的匹配
+        filtered_matches = []
+        last_end = 0
+        for m in all_matches:
+            if m["start"] >= last_end:
+                filtered_matches.append(m)
+                last_end = m["end"]
+
+        # 构建最终文本
+        pos = 0
+        for match in filtered_matches:
+            # 添加匹配前的普通文本
+            if match["start"] > pos:
+                plain_text = text[pos : match["start"]]
+                if plain_text:
+                    paragraph.add_run(plain_text)
+
+            # 添加格式化文本
+            style = match["style"]
+            run_text = match["text"]
+
+            if style.get("link"):
+                # 链接处理
+                run = paragraph.add_run(run_text)
+                run.font.color.rgb = RGBColor(0, 0, 255)
+                run.font.underline = True
+            elif style.get("code"):
+                # 行内代码
+                run = paragraph.add_run(run_text)
+                run.font.name = "Consolas"
+                run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimHei")
+                run.font.size = Pt(10)
+                # 添加背景色
+                shading = OxmlElement("w:shd")
+                shading.set(qn("w:fill"), "E8E8E8")
+                run._element.rPr.append(shading)
+            else:
+                run = paragraph.add_run(run_text)
+                if style.get("bold"):
+                    run.bold = True
+                if style.get("italic"):
+                    run.italic = True
+                if style.get("strike"):
+                    run.font.strike = True
+
+            pos = match["end"]
+
+        # 添加剩余的普通文本
+        if pos < len(text):
+            paragraph.add_run(text[pos:])
+
+    def add_code_block(self, doc: Document, code: str, language: str = ""):
+        """添加代码块"""
+        # 添加代码块段落
+        paragraph = doc.add_paragraph()
+        paragraph.paragraph_format.left_indent = Cm(0.5)
+        paragraph.paragraph_format.space_before = Pt(6)
+        paragraph.paragraph_format.space_after = Pt(6)
+
+        # 设置代码块背景
+        run = paragraph.add_run(code)
+        run.font.name = "Consolas"
+        run._element.rPr.rFonts.set(qn("w:eastAsia"), "SimHei")
+        run.font.size = Pt(10)
+
+        # 添加浅灰色背景
+        shading = OxmlElement("w:shd")
+        shading.set(qn("w:fill"), "F5F5F5")
+        paragraph._element.pPr.append(shading)
+
+    def add_table(self, doc: Document, table_lines: List[str]):
+        """添加表格"""
+        if len(table_lines) < 2:
+            return
+
+        # 解析表格数据
+        rows = []
+        for line in table_lines:
+            cells = [cell.strip() for cell in line.strip().strip("|").split("|")]
+            # 跳过分隔行
+            if all(re.fullmatch(r"[-:]+", cell) for cell in cells):
+                continue
+            rows.append(cells)
+
+        if not rows:
+            return
+
+        # 确定列数
+        num_cols = max(len(row) for row in rows)
+
+        # 创建表格
+        table = doc.add_table(rows=len(rows), cols=num_cols)
+        table.style = "Table Grid"
+        table.alignment = WD_TABLE_ALIGNMENT.CENTER
+
+        # 填充表格
+        for row_idx, row_data in enumerate(rows):
+            row = table.rows[row_idx]
+            for col_idx, cell_text in enumerate(row_data):
+                if col_idx < num_cols:
+                    cell = row.cells[col_idx]
+                    # 清除默认段落
+                    cell.paragraphs[0].clear()
+                    self.add_formatted_text(cell.paragraphs[0], cell_text)
+
+                    # 设置单元格字体
+                    for run in cell.paragraphs[0].runs:
+                        run.font.name = "Times New Roman"
+                        run._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
+                        run.font.size = Pt(10)
+
+                    # 表头加粗
+                    if row_idx == 0:
+                        for run in cell.paragraphs[0].runs:
+                            run.bold = True
+
+        # 设置表格列宽度自适应
+        for row in table.rows:
+            for cell in row.cells:
+                cell.paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER
+
+    def add_list_to_doc(
+        self, doc: Document, items: List[Tuple[int, str]], list_type: str
+    ):
+        """添加列表"""
+        for indent, text in items:
+            paragraph = doc.add_paragraph()
+
+            if list_type == "unordered":
+                # 无序列表使用项目符号
+                paragraph.style = "List Bullet"
+            else:
+                # 有序列表使用编号
+                paragraph.style = "List Number"
+
+            # 设置缩进
+            paragraph.paragraph_format.left_indent = Cm(0.5 * (indent + 1))
+
+            # 添加格式化文本
+            self.add_formatted_text(paragraph, text)
+
+            # 设置字体
+            for run in paragraph.runs:
+                run.font.name = "Times New Roman"
+                run._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
+
+    def add_horizontal_rule(self, doc: Document):
+        """添加水平分割线"""
+        paragraph = doc.add_paragraph()
+        paragraph.paragraph_format.space_before = Pt(12)
+        paragraph.paragraph_format.space_after = Pt(12)
+
+        # 添加底部边框作为分割线
+        pPr = paragraph._element.get_or_add_pPr()
+        pBdr = OxmlElement("w:pBdr")
+        bottom = OxmlElement("w:bottom")
+        bottom.set(qn("w:val"), "single")
+        bottom.set(qn("w:sz"), "6")
+        bottom.set(qn("w:space"), "1")
+        bottom.set(qn("w:color"), "auto")
+        pBdr.append(bottom)
+        pPr.append(pBdr)