feat: enhance markdown normalizer with mermaid fix and frontend logging

2026-01-10 15:45:20 +08:00
parent 5fa56ba88d
commit ef34cc326c
4 changed files with 1294 additions and 1 deletions
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -260,7 +260,46 @@ async def _emit_notification(
 ## 📋 日志规范 (Logging Standard)
- **禁止使用** `print()` 语句
+### 1. 前端控制台调试 (Frontend Console Debugging) - **优先推荐 (Preferred)**
 对于需要实时查看数据流、排查 UI 交互或内容变更的场景，**优先使用**前端控制台日志。这种方式可以直接在浏览器 DevTools (F12) 中查看，无需访问服务端日志。
 **实现方式**: 通过 `__event_emitter__` 发送 `type: "execute"` 事件执行 JS 代码。
 ```python
 import json
 async def _emit_debug_log(self, __event_emitter__, title: str, data: dict):
    """在浏览器控制台打印结构化调试日志"""
    if not self.valves.show_debug_log or not __event_emitter__:
        return
    try:
        js_code = f"""
            (async function() {{
                console.group("🛠️ {title}");
                console.log({json.dumps(data, ensure_ascii=False)});
                console.groupEnd();
            }})();
        """
        await __event_emitter__({
            "type": "execute",
            "data": {"code": js_code}
        })
    except Exception as e:
        print(f"Error emitting debug log: {e}")
 ```
 **配置要求**:
 - 在 `Valves` 中添加 `show_debug_log: bool` 开关，默认关闭。
 - 仅在开关开启时发送日志。
 ### 2. 服务端日志 (Server-side Logging)
 用于记录系统级错误、异常堆栈或无需前端感知的后台任务。
 - **禁止使用** `print()` 语句 (除非用于简单的脚本调试)
 - 必须使用 Python 标准库 `logging`
 ```python
--- a/plugins/filters/markdown_normalizer/markdown_normalizer.py
+++ b/plugins/filters/markdown_normalizer/markdown_normalizer.py
@@ -0,0 +1,519 @@
 """
 title: Markdown Normalizer
 author: Fu-Jie
 author_url: https://github.com/Fu-Jie
 funding_url: https://github.com/Fu-Jie/awesome-openwebui
 version: 1.0.0
 description: A production-grade content normalizer filter that fixes common Markdown formatting issues in LLM outputs, such as broken code blocks, LaTeX formulas, and list formatting.
 """
 from pydantic import BaseModel, Field
 from typing import Optional, List, Callable
 import re
 import logging
 import logging
 import asyncio
 import json
 from dataclasses import dataclass, field
 # Configure logging
 logger = logging.getLogger(__name__)
@dataclass
 class NormalizerConfig:
    """Switches that enable or disable individual normalization rules.

    ContentNormalizer.normalize() checks each ``enable_*`` flag before running
    the matching fixer, then runs every callable in ``custom_cleaners``.
    """
    enable_escape_fix: bool = True  # Fix excessive escape characters
    enable_thought_tag_fix: bool = True  # Normalize thought tags
    enable_code_block_fix: bool = True  # Fix code block formatting
    enable_latex_fix: bool = True  # Fix LaTeX formula formatting
    enable_list_fix: bool = (
        False  # Fix list item newlines (default off as it can be aggressive)
    )
    enable_unclosed_block_fix: bool = True  # Auto-close unclosed code blocks
    enable_fullwidth_symbol_fix: bool = False  # Fix full-width symbols in code blocks
    enable_mermaid_fix: bool = True  # Fix common Mermaid syntax errors
    enable_heading_fix: bool = (
        True  # Fix missing space in headings (#Header -> # Header)
    )
    enable_table_fix: bool = True  # Fix missing closing pipe in tables
    enable_xml_tag_cleanup: bool = True  # Cleanup leftover XML tags
    # Custom cleaner functions (for advanced extension); each takes the text
    # and returns the cleaned text, and they run after all built-in rules.
    custom_cleaners: List[Callable[[str], str]] = field(default_factory=list)
 class ContentNormalizer:
    """LLM Output Content Normalizer - Production Grade Implementation"""
    # --- 1. Pre-compiled Regex Patterns (Performance Optimization) ---
    # Compiled once when the class body executes and shared by every instance;
    # the _fix_* methods look patterns up by key.
    _PATTERNS = {
        # Code block prefix: a ``` that is not at the start of a line or of the text
        "code_block_prefix": re.compile(r"(?<!^)(?<!\n)(```)", re.MULTILINE),
        # Code block suffix: ```lang followed by spaces/tabs and then more text
        # on the same line (group 1 = fence + lang, group 2 = first text char)
        "code_block_suffix": re.compile(r"(```[\w\+\-\.]*)[ \t]+([^\n\r])"),
        # Code block indent: whitespace at start of line + ```
        "code_block_indent": re.compile(r"^[ \t]+(```)", re.MULTILINE),
        # Thought end tag: </thought>, </think> or </thinking> (any case)
        # together with trailing spaces/tabs and newlines
        "thought_end": re.compile(
            r"</(thought|think|thinking)>[ \t]*\n*", re.IGNORECASE
        ),
        # Thought start tag: <thought>, <think> or <thinking> (any case)
        "thought_start": re.compile(r"<(thought|think|thinking)>", re.IGNORECASE),
        # LaTeX block: \[ ... \] (may span lines because of DOTALL)
        "latex_bracket_block": re.compile(r"\\\[(.+?)\\\]", re.DOTALL),
        # LaTeX inline: \( ... \) (single line only)
        "latex_paren_inline": re.compile(r"\\\((.+?)\\\)"),
        # List item: non-newline + digits + dot + space
        "list_item": re.compile(r"([^\n])(\d+\. )"),
        # XML artifacts (e.g. Claude's)
        "xml_artifacts": re.compile(
            r"</?(?:antArtifact|antThinking|artifact)[^>]*>", re.IGNORECASE
        ),
        # Mermaid: Match various node shapes and quote unquoted labels
        # Fix "reverse optimization": Must precisely match shape delimiters to avoid breaking structure
        # Priority: Longer delimiters match first
        # Group 1 is the node ID; every shape alternative then contributes
        # three groups (open delimiter, label, close delimiter). The
        # (?![\"]) / (?<![\"]) guards skip labels that already begin or end
        # with a double quote.
        "mermaid_node": re.compile(
            r"(\w+)\s*(?:"
            r"(\(\(\()(?![\"])(.*?)(?<![\"])(\)\)\))|"  # (((...))) Double Circle
            r"(\(\()(?![\"])(.*?)(?<![\"])(\)\))|"  # ((...)) Circle
            r"(\(\[)(?![\"])(.*?)(?<![\"])(\]\))|"  # ([...]) Stadium
            r"(\[\()(?![\"])(.*?)(?<![\"])(\)\])|"  # [(...)] Cylinder
            r"(\[\[)(?![\"])(.*?)(?<![\"])(\]\])|"  # [[...]] Subroutine
            r"(\{\{)(?![\"])(.*?)(?<![\"])(\}\})|"  # {{...}} Hexagon
            r"(\[/)(?![\"])(.*?)(?<![\"])(/\])|"  # [/.../] Parallelogram
            r"(\[\\)(?![\"])(.*?)(?<![\"])(\\\])|"  # [\...\] Parallelogram Alt
            r"(\[/)(?![\"])(.*?)(?<![\"])(\\\])|"  # [/...\] Trapezoid
            r"(\[\\)(?![\"])(.*?)(?<![\"])(/\])|"  # [\.../] Trapezoid Alt
            r"(\()(?![\"])(.*?)(?<![\"])(\))|"  # (...) Round
            r"(\[)(?![\"])(.*?)(?<![\"])(\])|"  # [...] Square
            r"(\{)(?![\"])(.*?)(?<![\"])(\})|"  # {...} Rhombus
            r"(>)(?![\"])(.*?)(?<![\"])(\])"  # >...] Asymmetric
            r")"
        ),
        # Heading: #Heading -> # Heading
        "heading_space": re.compile(r"^(#+)([^ \n#])", re.MULTILINE),
        # Table: | col1 | col2 -> | col1 | col2 |
        # (a line that starts with a pipe and does not end with one)
        "table_pipe": re.compile(r"^(\|.*[^|\n])$", re.MULTILINE),
    }
    def __init__(self, config: Optional[NormalizerConfig] = None):
        self.config = config or NormalizerConfig()
        self.applied_fixes = []
    def normalize(self, content: str) -> str:
        """Main entry point: apply all normalization rules in order"""
        self.applied_fixes = []
        if not content:
            return content
        original_content = content  # Keep a copy for logging
        try:
            # 1. Escape character fix (Must be first)
            if self.config.enable_escape_fix:
                original = content
                content = self._fix_escape_characters(content)
                if content != original:
                    self.applied_fixes.append("Fix Escape Chars")
            # 2. Thought tag normalization
            if self.config.enable_thought_tag_fix:
                original = content
                content = self._fix_thought_tags(content)
                if content != original:
                    self.applied_fixes.append("Normalize Thought Tags")
            # 3. Code block formatting fix
            if self.config.enable_code_block_fix:
                original = content
                content = self._fix_code_blocks(content)
                if content != original:
                    self.applied_fixes.append("Fix Code Blocks")
            # 4. LaTeX formula normalization
            if self.config.enable_latex_fix:
                original = content
                content = self._fix_latex_formulas(content)
                if content != original:
                    self.applied_fixes.append("Normalize LaTeX")
            # 5. List formatting fix
            if self.config.enable_list_fix:
                original = content
                content = self._fix_list_formatting(content)
                if content != original:
                    self.applied_fixes.append("Fix List Format")
            # 6. Unclosed code block fix
            if self.config.enable_unclosed_block_fix:
                original = content
                content = self._fix_unclosed_code_blocks(content)
                if content != original:
                    self.applied_fixes.append("Close Code Blocks")
            # 7. Full-width symbol fix (in code blocks only)
            if self.config.enable_fullwidth_symbol_fix:
                original = content
                content = self._fix_fullwidth_symbols_in_code(content)
                if content != original:
                    self.applied_fixes.append("Fix Full-width Symbols")
            # 8. Mermaid syntax fix
            if self.config.enable_mermaid_fix:
                original = content
                content = self._fix_mermaid_syntax(content)
                if content != original:
                    self.applied_fixes.append("Fix Mermaid Syntax")
            # 9. Heading fix
            if self.config.enable_heading_fix:
                original = content
                content = self._fix_headings(content)
                if content != original:
                    self.applied_fixes.append("Fix Headings")
            # 10. Table fix
            if self.config.enable_table_fix:
                original = content
                content = self._fix_tables(content)
                if content != original:
                    self.applied_fixes.append("Fix Tables")
            # 11. XML tag cleanup
            if self.config.enable_xml_tag_cleanup:
                original = content
                content = self._cleanup_xml_tags(content)
                if content != original:
                    self.applied_fixes.append("Cleanup XML Tags")
            # 9. Custom cleaners
            for cleaner in self.config.custom_cleaners:
                original = content
                content = cleaner(content)
                if content != original:
                    self.applied_fixes.append("Custom Cleaner")
            if self.applied_fixes:
                logger.info(f"Markdown Normalizer Applied Fixes: {self.applied_fixes}")
                logger.debug(
                    f"--- Original Content ---\n{original_content}\n------------------------"
                )
                logger.debug(
                    f"--- Normalized Content ---\n{content}\n--------------------------"
                )
            return content
        except Exception as e:
            # Production safeguard: return original content on error
            logger.error(f"Content normalization failed: {e}", exc_info=True)
            return content
    def _fix_escape_characters(self, content: str) -> str:
        """Fix excessive escape characters"""
        content = content.replace("\\r\\n", "\n")
        content = content.replace("\\n", "\n")
        content = content.replace("\\t", "\t")
        content = content.replace("\\\\", "\\")
        return content
    def _fix_thought_tags(self, content: str) -> str:
        """Normalize thought tags: unify naming and fix spacing"""
        # 1. Standardize start tag: <think>, <thinking> -> <thought>
        content = self._PATTERNS["thought_start"].sub("<thought>", content)
        # 2. Standardize end tag and ensure newlines: </think> -> </thought>\n\n
        return self._PATTERNS["thought_end"].sub("</thought>\n\n", content)
    def _fix_code_blocks(self, content: str) -> str:
        """Fix code block formatting (prefixes, suffixes, indentation)"""
        # Remove indentation before code blocks
        content = self._PATTERNS["code_block_indent"].sub(r"\1", content)
        # Ensure newline before ```
        content = self._PATTERNS["code_block_prefix"].sub(r"\n\1", content)
        # Ensure newline after ```lang
        content = self._PATTERNS["code_block_suffix"].sub(r"\1\n\2", content)
        return content
    def _fix_latex_formulas(self, content: str) -> str:
        """Normalize LaTeX formulas: \[ -> $$ (block), \( -> $ (inline)"""
        content = self._PATTERNS["latex_bracket_block"].sub(r"$$\1$$", content)
        content = self._PATTERNS["latex_paren_inline"].sub(r"$\1$", content)
        return content
    def _fix_list_formatting(self, content: str) -> str:
        """Fix missing newlines in lists (e.g., 'text1. item' -> 'text\\n1. item')"""
        return self._PATTERNS["list_item"].sub(r"\1\n\2", content)
    def _fix_unclosed_code_blocks(self, content: str) -> str:
        """Auto-close unclosed code blocks"""
        if content.count("```") % 2 != 0:
            content += "\n```"
        return content
    def _fix_fullwidth_symbols_in_code(self, content: str) -> str:
        """Convert full-width symbols to half-width inside code blocks"""
        FULLWIDTH_MAP = {
            "，": ",",
            "。": ".",
            "（": "(",
            "）": ")",
            "【": "[",
            "】": "]",
            "；": ";",
            "：": ":",
            "？": "?",
            "！": "!",
            '"': '"',
            '"': '"',
            """: "'", """: "'",
        }
        parts = content.split("```")
        # Code block content is at odd indices: 1, 3, 5...
        for i in range(1, len(parts), 2):
            for full, half in FULLWIDTH_MAP.items():
                parts[i] = parts[i].replace(full, half)
        return "```".join(parts)
    def _fix_mermaid_syntax(self, content: str) -> str:
        """Fix common Mermaid syntax errors while preserving node shapes"""
        def replacer(match):
            # Group 1 is ID
            id_str = match.group(1)
            # Find matching shape group
            # Groups start at index 2, each shape has 3 groups (Open, Content, Close)
            # We iterate to find the non-None one
            groups = match.groups()
            for i in range(1, len(groups), 3):
                if groups[i] is not None:
                    open_char = groups[i]
                    content = groups[i + 1]
                    close_char = groups[i + 2]
                    # Escape quotes in content
                    content = content.replace('"', '\\"')
                    return f'{id_str}{open_char}"{content}"{close_char}'
            return match.group(0)
        parts = content.split("```")
        for i in range(1, len(parts), 2):
            # Check if it's a mermaid block
            lang_line = parts[i].split("\n", 1)[0].strip().lower()
            if "mermaid" in lang_line:
                # Apply the comprehensive regex fix
                parts[i] = self._PATTERNS["mermaid_node"].sub(replacer, parts[i])
                # Auto-close subgraphs
                subgraph_count = len(
                    re.findall(r"\bsubgraph\b", parts[i], re.IGNORECASE)
                )
                end_count = len(re.findall(r"\bend\b", parts[i], re.IGNORECASE))
                if subgraph_count > end_count:
                    missing_ends = subgraph_count - end_count
                    parts[i] = parts[i].rstrip() + ("\n    end" * missing_ends) + "\n"
        return "```".join(parts)
    def _fix_headings(self, content: str) -> str:
        """Fix missing space in headings: #Heading -> # Heading"""
        # We only fix if it's not inside a code block.
        # But splitting by code block is expensive.
        # Given headings usually don't appear inside code blocks without space in valid code (except comments),
        # we might risk false positives in comments like `#TODO`.
        # To be safe, let's split by code blocks.
        parts = content.split("```")
        for i in range(0, len(parts), 2):  # Even indices are markdown text
            parts[i] = self._PATTERNS["heading_space"].sub(r"\1 \2", parts[i])
        return "```".join(parts)
    def _fix_tables(self, content: str) -> str:
        """Fix tables missing closing pipe"""
        parts = content.split("```")
        for i in range(0, len(parts), 2):
            parts[i] = self._PATTERNS["table_pipe"].sub(r"\1|", parts[i])
        return "```".join(parts)
    def _cleanup_xml_tags(self, content: str) -> str:
        """Remove leftover XML tags"""
        return self._PATTERNS["xml_artifacts"].sub("", content)
 class Filter:
    class Valves(BaseModel):
        # Settings model for this filter. Filter.__init__ instantiates it with
        # the defaults below and outlet() copies every enable_* value into a
        # NormalizerConfig of the same field name.
        priority: int = Field(
            default=50,
            description="Priority level. Higher runs later (recommended to run after other filters).",
        )
        # --- Rule switches (mirrored into NormalizerConfig) ---
        enable_escape_fix: bool = Field(
            default=True, description="Fix excessive escape characters (\\n, \\t, etc.)"
        )
        enable_thought_tag_fix: bool = Field(
            default=True, description="Normalize </thought> tags"
        )
        enable_code_block_fix: bool = Field(
            default=True,
            description="Fix code block formatting (indentation, newlines)",
        )
        enable_latex_fix: bool = Field(
            default=True, description="Normalize LaTeX formulas (\\[ -> $$, \\( -> $)"
        )
        enable_list_fix: bool = Field(
            default=False, description="Fix list item newlines (Experimental)"
        )
        enable_unclosed_block_fix: bool = Field(
            default=True, description="Auto-close unclosed code blocks"
        )
        enable_fullwidth_symbol_fix: bool = Field(
            default=False, description="Fix full-width symbols in code blocks"
        )
        enable_mermaid_fix: bool = Field(
            default=True,
            description="Fix common Mermaid syntax errors (e.g. unquoted labels)",
        )
        enable_heading_fix: bool = Field(
            default=True,
            description="Fix missing space in headings (#Header -> # Header)",
        )
        enable_table_fix: bool = Field(
            default=True, description="Fix missing closing pipe in tables"
        )
        enable_xml_tag_cleanup: bool = Field(
            default=True, description="Cleanup leftover XML tags"
        )
        # --- Reporting switches (read by _emit_status / _emit_debug_log) ---
        show_status: bool = Field(
            default=True, description="Show status notification when fixes are applied"
        )
        show_debug_log: bool = Field(
            default=False, description="Print debug logs to browser console (F12)"
        )
    def __init__(self):
        # Start with the default value of every field declared on Valves.
        self.valves = self.Valves()
    def _contains_html(self, content: str) -> bool:
        """Check if content contains HTML tags (to avoid breaking HTML output).

        Only a complete opening or closing tag for one of the listed element
        names counts, with the name directly after ``<`` or ``</``.
        Comparisons written in prose such as ``a < b`` or ``x < p`` therefore
        do not make the whole message look like HTML.

        Args:
            content: Message text to inspect.

        Returns:
            True if at least one such tag is found, case-insensitively.
        """
        tag_names = (
            "html|head|body|div|span|p|br|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th"
            "|img|a|b|i|strong|em|code|pre|blockquote|h[1-6]|script|style|form|input"
            "|button|label|select|option|iframe|link|meta|title"
        )
        # "<" or "</", the tag name, optional attributes, optional "/", then ">".
        pattern = rf"</?(?:{tag_names})(?:\s[^<>]*)?/?>"
        return bool(re.search(pattern, content, re.IGNORECASE))
    async def _emit_status(self, __event_emitter__, applied_fixes: List[str]):
        """Send a finished ``status`` event listing the fixes that were applied.

        Nothing is sent when the ``show_status`` valve is off or
        ``applied_fixes`` is empty.

        Args:
            __event_emitter__: Awaitable callback that receives the event dict.
            applied_fixes: Labels of the rules that changed the content.
        """
        if not self.valves.show_status or not applied_fixes:
            return
        description = f"✓ Markdown Normalized: {', '.join(applied_fixes)}"
        try:
            await __event_emitter__(
                {
                    "type": "status",
                    "data": {
                        "description": description,
                        "done": True,
                    },
                }
            )
        except Exception as e:
            # The notification is best effort; report through the module
            # logger rather than print() and keep the response flowing.
            logger.warning(f"Error emitting status: {e}")
    async def _emit_debug_log(
        self, __event_call__, applied_fixes: List[str], original: str, normalized: str
    ):
        """Emit debug log to browser console via JS execution.

        Sends an ``execute`` event whose script logs the applied fixes plus the
        original and normalized text inside one console group. Nothing is sent
        when the ``show_debug_log`` valve is off or no ``__event_call__`` was
        supplied.

        Args:
            __event_call__: Awaitable callback that receives the event dict.
            applied_fixes: Labels of the rules that changed the content.
            original: Content before normalization.
            normalized: Content after normalization.
        """
        if not self.valves.show_debug_log or not __event_call__:
            return
        try:
            # Each value is embedded as a JSON literal so that arbitrary
            # message text cannot terminate the surrounding JS statement.
            fixes_js = json.dumps(applied_fixes, ensure_ascii=False)
            original_js = json.dumps(original, ensure_ascii=False)
            normalized_js = json.dumps(normalized, ensure_ascii=False)
            # Construct JS code
            js_code = f"""
                (async function() {{
                    console.group("🛠️ Markdown Normalizer Debug");
                    console.log("Applied Fixes:", {fixes_js});
                    console.log("Original Content:", {original_js});
                    console.log("Normalized Content:", {normalized_js});
                    console.groupEnd();
                }})();
            """
            await __event_call__(
                {
                    "type": "execute",
                    "data": {"code": js_code},
                }
            )
        except Exception as e:
            # Debug output is best effort; report through the module logger
            # rather than print() and keep the response flowing.
            logger.warning(f"Error emitting debug log: {e}")
    async def outlet(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __event_emitter__=None,
        __event_call__=None,
        __metadata__: Optional[dict] = None,
    ) -> dict:
        """
        Process the response body to normalize Markdown content.

        Only the last message is considered, and only when its role is
        ``assistant`` and its content is a string without HTML tags. When
        normalization changes the text, the message is updated in place, a
        status event is sent through ``__event_emitter__`` and a console debug
        log through ``__event_call__`` (each subject to its own valve).

        Args:
            body: Response payload; ``body["messages"]`` is read if present.
            __user__: Unused here.
            __event_emitter__: Optional callback for the status event.
            __event_call__: Optional callback for the console debug log.
            __metadata__: Unused here.

        Returns:
            The same ``body`` object, possibly with the last message rewritten.
        """
        if "messages" not in body or not body["messages"]:
            return body
        last = body["messages"][-1]
        content = last.get("content", "") or ""
        if last.get("role") != "assistant" or not isinstance(content, str):
            return body
        # Skip if content looks like HTML to avoid breaking it
        if self._contains_html(content):
            return body
        # Configure normalizer based on valves
        config = NormalizerConfig(
            enable_escape_fix=self.valves.enable_escape_fix,
            enable_thought_tag_fix=self.valves.enable_thought_tag_fix,
            enable_code_block_fix=self.valves.enable_code_block_fix,
            enable_latex_fix=self.valves.enable_latex_fix,
            enable_list_fix=self.valves.enable_list_fix,
            enable_unclosed_block_fix=self.valves.enable_unclosed_block_fix,
            enable_fullwidth_symbol_fix=self.valves.enable_fullwidth_symbol_fix,
            enable_mermaid_fix=self.valves.enable_mermaid_fix,
            enable_heading_fix=self.valves.enable_heading_fix,
            enable_table_fix=self.valves.enable_table_fix,
            enable_xml_tag_cleanup=self.valves.enable_xml_tag_cleanup,
        )
        normalizer = ContentNormalizer(config)
        # Execute normalization
        new_content = normalizer.normalize(content)
        if new_content == content:
            return body
        last["content"] = new_content
        if __event_emitter__:
            await self._emit_status(__event_emitter__, normalizer.applied_fixes)
        # The console log travels over __event_call__, so it must not depend
        # on __event_emitter__ being present; _emit_debug_log checks the valve
        # and the callback itself.
        await self._emit_debug_log(
            __event_call__,
            normalizer.applied_fixes,
            content,
            new_content,
        )
        return body
--- a/plugins/filters/markdown_normalizer/markdown_normalizer_cn.py
+++ b/plugins/filters/markdown_normalizer/markdown_normalizer_cn.py
@@ -0,0 +1,544 @@
 """
 title: Markdown 格式修复器 (Markdown Normalizer)
 author: Fu-Jie
 author_url: https://github.com/Fu-Jie
 funding_url: https://github.com/Fu-Jie/awesome-openwebui
 version: 1.0.0
 description: 生产级内容规范化过滤器，修复 LLM 输出中常见的 Markdown 格式问题，如损坏的代码块、LaTeX 公式、Mermaid 图表和列表格式。
 """
 from pydantic import BaseModel, Field
 from typing import Optional, List, Callable
 import re
 import logging
 import asyncio
 import json
 from dataclasses import dataclass, field
 # Configure logging
 logger = logging.getLogger(__name__)
@dataclass
 class NormalizerConfig:
    """Configuration class for enabling/disabling specific normalization rules"""
    enable_escape_fix: bool = True  # Fix excessive escape characters
    enable_thought_tag_fix: bool = True  # Normalize chain-of-thought tags
    enable_code_block_fix: bool = True  # Fix code block formatting
    enable_latex_fix: bool = True  # Fix LaTeX formula formatting
    enable_list_fix: bool = False  # Fix list item newlines (off by default as it can be too aggressive)
    enable_unclosed_block_fix: bool = True  # Auto-close unclosed code blocks
    enable_fullwidth_symbol_fix: bool = False  # Fix full-width symbols in code blocks
    enable_mermaid_fix: bool = True  # Fix common Mermaid syntax errors
    enable_heading_fix: bool = True  # Fix missing space in headings (#Header -> # Header)
    enable_table_fix: bool = True  # Fix missing closing pipe in tables
    enable_xml_tag_cleanup: bool = True  # Clean up leftover XML tags
    # Custom cleaner functions (for advanced extension)
    custom_cleaners: List[Callable[[str], str]] = field(default_factory=list)
 class ContentNormalizer:
    """LLM Output Content Normalizer - Production Grade Implementation"""
    # --- 1. Pre-compiled Regex Patterns (Performance Optimization) ---
    # Compiled once when the class body executes and shared by every instance;
    # the _fix_* methods look patterns up by key.
    _PATTERNS = {
        # Code block prefix: a ``` that is not at the start of a line or of the text
        "code_block_prefix": re.compile(r"(?<!^)(?<!\n)(```)", re.MULTILINE),
        # Code block suffix: ```lang followed by spaces/tabs and then more text
        # on the same line (group 1 = fence + lang, group 2 = first text char)
        "code_block_suffix": re.compile(r"(```[\w\+\-\.]*)[ \t]+([^\n\r])"),
        # Code block indent: whitespace at start of line + ```
        "code_block_indent": re.compile(r"^[ \t]+(```)", re.MULTILINE),
        # Thought end tag: </thought>, </think> or </thinking> (any case)
        # together with trailing spaces/tabs and newlines
        "thought_end": re.compile(
            r"</(thought|think|thinking)>[ \t]*\n*", re.IGNORECASE
        ),
        # Thought start tag: <thought>, <think> or <thinking> (any case)
        "thought_start": re.compile(r"<(thought|think|thinking)>", re.IGNORECASE),
        # LaTeX block: \[ ... \] (may span lines because of DOTALL)
        "latex_bracket_block": re.compile(r"\\\[(.+?)\\\]", re.DOTALL),
        # LaTeX inline: \( ... \) (single line only)
        "latex_paren_inline": re.compile(r"\\\((.+?)\\\)"),
        # List item: non-newline + digits + dot + space
        "list_item": re.compile(r"([^\n])(\d+\. )"),
        # XML artifacts (e.g. Claude's)
        "xml_artifacts": re.compile(
            r"</?(?:antArtifact|antThinking|artifact)[^>]*>", re.IGNORECASE
        ),
        # Mermaid: match nodes of every shape and quote labels that are unquoted
        # Fixes the "reverse optimization" problem: shape delimiters must be
        # matched precisely so the shape structure is not broken
        # Priority: longer delimiters are matched first
        # Group 1 is the node ID; every shape alternative then contributes
        # three groups (open delimiter, label, close delimiter). The
        # (?![\"]) / (?<![\"]) guards skip labels that already begin or end
        # with a double quote.
        "mermaid_node": re.compile(
            r"(\w+)\s*(?:"
            r"(\(\(\()(?![\"])(.*?)(?<![\"])(\)\)\))|"  # (((...))) Double Circle
            r"(\(\()(?![\"])(.*?)(?<![\"])(\)\))|"  # ((...)) Circle
            r"(\(\[)(?![\"])(.*?)(?<![\"])(\]\))|"  # ([...]) Stadium
            r"(\[\()(?![\"])(.*?)(?<![\"])(\)\])|"  # [(...)] Cylinder
            r"(\[\[)(?![\"])(.*?)(?<![\"])(\]\])|"  # [[...]] Subroutine
            r"(\{\{)(?![\"])(.*?)(?<![\"])(\}\})|"  # {{...}} Hexagon
            r"(\[/)(?![\"])(.*?)(?<![\"])(/\])|"  # [/.../] Parallelogram
            r"(\[\\)(?![\"])(.*?)(?<![\"])(\\\])|"  # [\...\] Parallelogram Alt
            r"(\[/)(?![\"])(.*?)(?<![\"])(\\\])|"  # [/...\] Trapezoid
            r"(\[\\)(?![\"])(.*?)(?<![\"])(/\])|"  # [\.../] Trapezoid Alt
            r"(\()(?![\"])(.*?)(?<![\"])(\))|"  # (...) Round
            r"(\[)(?![\"])(.*?)(?<![\"])(\])|"  # [...] Square
            r"(\{)(?![\"])(.*?)(?<![\"])(\})|"  # {...} Rhombus
            r"(>)(?![\"])(.*?)(?<![\"])(\])"  # >...] Asymmetric
            r")"
        ),
        # Heading: #Heading -> # Heading
        "heading_space": re.compile(r"^(#+)([^ \n#])", re.MULTILINE),
        # Table: | col1 | col2 -> | col1 | col2 |
        # (a line that starts with a pipe and does not end with one)
        "table_pipe": re.compile(r"^(\|.*[^|\n])$", re.MULTILINE),
    }
    def __init__(self, config: Optional[NormalizerConfig] = None):
        self.config = config or NormalizerConfig()
        self.applied_fixes = []
    def normalize(self, content: str) -> str:
        """Main entry point: apply all normalization rules in order"""
        self.applied_fixes = []
        if not content:
            return content
        original_content = content  # Keep a copy for logging
        try:
            # 1. Escape character fix (Must be first)
            if self.config.enable_escape_fix:
                original = content
                content = self._fix_escape_characters(content)
                if content != original:
                    self.applied_fixes.append("Fix Escape Chars")
            # 2. Thought tag normalization
            if self.config.enable_thought_tag_fix:
                original = content
                content = self._fix_thought_tags(content)
                if content != original:
                    self.applied_fixes.append("Normalize Thought Tags")
            # 3. Code block formatting fix
            if self.config.enable_code_block_fix:
                original = content
                content = self._fix_code_blocks(content)
                if content != original:
                    self.applied_fixes.append("Fix Code Blocks")
            # 4. LaTeX formula normalization
            if self.config.enable_latex_fix:
                original = content
                content = self._fix_latex_formulas(content)
                if content != original:
                    self.applied_fixes.append("Normalize LaTeX")
            # 5. List formatting fix
            if self.config.enable_list_fix:
                original = content
                content = self._fix_list_formatting(content)
                if content != original:
                    self.applied_fixes.append("Fix List Format")
            # 6. Unclosed code block fix
            if self.config.enable_unclosed_block_fix:
                original = content
                content = self._fix_unclosed_code_blocks(content)
                if content != original:
                    self.applied_fixes.append("Close Code Blocks")
            # 7. Full-width symbol fix (in code blocks only)
            if self.config.enable_fullwidth_symbol_fix:
                original = content
                content = self._fix_fullwidth_symbols_in_code(content)
                if content != original:
                    self.applied_fixes.append("Fix Full-width Symbols")
            # 8. Mermaid syntax fix
            if self.config.enable_mermaid_fix:
                original = content
                content = self._fix_mermaid_syntax(content)
                if content != original:
                    self.applied_fixes.append("Fix Mermaid Syntax")
            # 9. Heading fix
            if self.config.enable_heading_fix:
                original = content
                content = self._fix_headings(content)
                if content != original:
                    self.applied_fixes.append("Fix Headings")
            # 10. Table fix
            if self.config.enable_table_fix:
                original = content
                content = self._fix_tables(content)
                if content != original:
                    self.applied_fixes.append("Fix Tables")
            # 11. XML tag cleanup
            if self.config.enable_xml_tag_cleanup:
                original = content
                content = self._cleanup_xml_tags(content)
                if content != original:
                    self.applied_fixes.append("Cleanup XML Tags")
            # 9. Custom cleaners
            for cleaner in self.config.custom_cleaners:
                original = content
                content = cleaner(content)
                if content != original:
                    self.applied_fixes.append("Custom Cleaner")
            if self.applied_fixes:
                print(f"[Markdown Normalizer] Applied fixes: {self.applied_fixes}")
                print(
                    f"[Markdown Normalizer] --- Original Content ---\n{original_content}\n------------------------"
                )
                print(
                    f"[Markdown Normalizer] --- Normalized Content ---\n{content}\n--------------------------"
                )
            return content
        except Exception as e:
            # Production safeguard: return original content on error
            logger.error(f"Content normalization failed: {e}", exc_info=True)
            return content
    def _fix_escape_characters(self, content: str) -> str:
        """Replace literal escape sequences with the characters they denote.

        The substitutions run one after another in a fixed order: the
        four-character literal backslash-r-backslash-n, then backslash-n,
        then backslash-t, and finally a doubled backslash is collapsed to a
        single one.
        """
        substitutions = (
            ("\\r\\n", "\n"),
            ("\\n", "\n"),
            ("\\t", "\t"),
            ("\\\\", "\\"),
        )
        for literal, actual in substitutions:
            content = content.replace(literal, actual)
        return content
    def _fix_thought_tags(self, content: str) -> str:
        """Rewrite thought tags to the canonical ``<thought>`` form.

        Every match of the ``thought_start`` pattern becomes ``<thought>``;
        every match of the ``thought_end`` pattern becomes ``</thought>``
        followed by a blank line.
        """
        patterns = self._PATTERNS
        # Opening tags first (e.g. <think>, <thinking> -> <thought>)
        with_open_tags = patterns["thought_start"].sub("<thought>", content)
        # Then closing tags, which also get two trailing newlines
        return patterns["thought_end"].sub("</thought>\n\n", with_open_tags)
    def _fix_code_blocks(self, content: str) -> str:
        """Tidy the layout around code fences.

        Three substitutions run in order: drop indentation in front of a
        fence, force a newline before a fence, and force a newline after the
        fence's language tag.
        """
        steps = (
            ("code_block_indent", r"\1"),  # strip indentation before ```
            ("code_block_prefix", r"\n\1"),  # newline before ```
            ("code_block_suffix", r"\1\n\2"),  # newline after ```lang
        )
        for pattern_name, replacement in steps:
            content = self._PATTERNS[pattern_name].sub(replacement, content)
        return content
    def _fix_latex_formulas(self, content: str) -> str:
        r"""Normalize LaTeX delimiters: \[ -> $$ (block), \( -> $ (inline).

        The docstring is a raw string because ``\[`` and ``\(`` are not valid
        escape sequences in a normal string literal.

        Args:
            content: Text that may contain bracket/paren LaTeX delimiters.

        Returns:
            The text with every ``latex_bracket_block`` match rewritten as
            ``$$...$$`` and every ``latex_paren_inline`` match as ``$...$``.
        """
        content = self._PATTERNS["latex_bracket_block"].sub(r"$$\1$$", content)
        content = self._PATTERNS["latex_paren_inline"].sub(r"$\1$", content)
        return content
    def _fix_list_formatting(self, content: str) -> str:
        """Insert the line break missing before a list item glued to text.

        Example: ``'text1. item'`` becomes ``'text\\n1. item'``.
        """
        list_item_re = self._PATTERNS["list_item"]
        repaired = list_item_re.sub(r"\1\n\2", content)
        return repaired
    def _fix_unclosed_code_blocks(self, content: str) -> str:
        """Append a closing fence when the number of ``` markers is odd."""
        fence_total = content.count("```")
        has_dangling_fence = fence_total % 2 != 0
        return content + "\n```" if has_dangling_fence else content
    def _fix_fullwidth_symbols_in_code(self, content: str) -> str:
        """Convert full-width punctuation to ASCII inside fenced code blocks.

        The text is split on the ``` fence marker; odd-numbered segments are
        treated as code and translated, while even-numbered segments (prose)
        are returned untouched.

        Args:
            content: Markdown text that may contain fenced code blocks.

        Returns:
            The text with full-width punctuation and curly quotes inside code
            segments replaced by their ASCII counterparts.
        """
        # Curly quotes are written as \u escapes so they cannot be flattened
        # into plain ASCII quotes by an editor or transport layer, which
        # would turn the entries into no-ops or garbage keys.
        fullwidth_table = str.maketrans(
            {
                "，": ",",
                "。": ".",
                "（": "(",
                "）": ")",
                "【": "[",
                "】": "]",
                "；": ";",
                "：": ":",
                "？": "?",
                "！": "!",
                "\u201c": '"',  # left double quotation mark
                "\u201d": '"',  # right double quotation mark
                "\u2018": "'",  # left single quotation mark
                "\u2019": "'",  # right single quotation mark
            }
        )
        parts = content.split("```")
        # Code block content is at odd indices: 1, 3, 5...
        for i in range(1, len(parts), 2):
            # Every key is a single character, so one translate pass is
            # equivalent to chaining a replace() per symbol.
            parts[i] = parts[i].translate(fullwidth_table)
        return "```".join(parts)
    def _fix_mermaid_syntax(self, content: str) -> str:
        """Repair common Mermaid syntax errors while preserving node shapes.

        Two fixes are applied to every fenced block whose info line contains
        ``mermaid``:

        1. Node labels matched by the ``mermaid_node`` pattern are wrapped in
           double quotes, keeping the original shape delimiters.
        2. Unclosed ``subgraph`` sections get the missing ``end`` statements
           appended at the end of the block.

        Args:
            content: Markdown text that may contain fenced Mermaid blocks.

        Returns:
            The text with Mermaid blocks repaired; everything else unchanged.
        """

        def quote_label(match):
            # Group 1 is the node ID. The remaining groups come in triples of
            # (open delimiter, label, close delimiter), one triple per shape;
            # only the triple of the shape that actually matched is non-None.
            node_id = match.group(1)
            groups = match.groups()  # groups[0] is the ID; triples start at 1
            for i in range(1, len(groups) - 2, 3):
                if groups[i] is not None:
                    opener, label, closer = groups[i : i + 3]
                    # Escape embedded double quotes before wrapping the label.
                    label = label.replace('"', '\\"')
                    return f'{node_id}{opener}"{label}"{closer}'
            return match.group(0)

        # Only statement-level, lowercase keywords are counted. A node ID such
        # as ``End`` or the word "end" inside a label is not a block
        # terminator and must not cancel out an open subgraph.
        subgraph_stmt = re.compile(r"(?:^|;)[ \t]*subgraph\b", re.MULTILINE)
        end_stmt = re.compile(r"(?:^|;)[ \t]*end[ \t\r]*(?=;|$)", re.MULTILINE)

        parts = content.split("```")
        # Fenced block content is at odd indices: 1, 3, 5...
        for i in range(1, len(parts), 2):
            # The first line of a fenced segment is its info (language) line.
            lang_line = parts[i].split("\n", 1)[0].strip().lower()
            if "mermaid" not in lang_line:
                continue

            # Quote node labels.
            parts[i] = self._PATTERNS["mermaid_node"].sub(quote_label, parts[i])

            # Auto-close subgraphs (heuristic: compare keyword statement counts).
            subgraph_count = len(subgraph_stmt.findall(parts[i]))
            end_count = len(end_stmt.findall(parts[i]))
            if subgraph_count > end_count:
                missing_ends = subgraph_count - end_count
                parts[i] = parts[i].rstrip() + ("\n    end" * missing_ends) + "\n"
        return "```".join(parts)
    def _fix_headings(self, content: str) -> str:
        """Add the space missing after heading markers: #Heading -> # Heading.

        Only text outside code fences is touched, so code comments such as
        ``#TODO`` inside a fenced block are left alone.
        """
        heading_re = self._PATTERNS["heading_space"]
        segments = content.split("```")
        # Even-numbered segments are ordinary Markdown; odd ones are code.
        segments[::2] = [heading_re.sub(r"\1 \2", text) for text in segments[::2]]
        return "```".join(segments)
    def _fix_tables(self, content: str) -> str:
        """Append the closing pipe to table rows that lack one.

        Fenced code segments (odd positions after splitting on ```) are
        passed through unchanged.
        """
        pipe_re = self._PATTERNS["table_pipe"]
        rebuilt = [
            pipe_re.sub(r"\1|", chunk) if position % 2 == 0 else chunk
            for position, chunk in enumerate(content.split("```"))
        ]
        return "```".join(rebuilt)
    def _cleanup_xml_tags(self, content: str) -> str:
        """Strip every match of the ``xml_artifacts`` pattern from the text."""
        artifact_re = self._PATTERNS["xml_artifacts"]
        return artifact_re.sub("", content)
 class Filter:
    # User-facing settings for the filter. Each ``enable_*`` flag is copied
    # one-to-one into ``NormalizerConfig`` by ``outlet``; ``show_status`` and
    # ``show_debug_log`` gate the two notification helpers.
    class Valves(BaseModel):
        # Ordering hint; per the description, higher values run later.
        priority: int = Field(
            default=50,
            description="优先级。数值越高运行越晚 (建议在其他过滤器之后运行)。",
        )
        # Convert literal escape sequences (\n, \t, ...) into real characters.
        enable_escape_fix: bool = Field(
            default=True, description="修复过度的转义字符 (\\n, \\t 等)"
        )
        # Normalize chain-of-thought tags (<think> -> <thought>).
        enable_thought_tag_fix: bool = Field(
            default=True, description="规范化思维链标签 (<think> -> <thought>)"
        )
        # Repair code fence layout (indentation, newlines).
        enable_code_block_fix: bool = Field(
            default=True,
            description="修复代码块格式 (缩进、换行)",
        )
        # Rewrite LaTeX delimiters (\[ -> $$, \( -> $).
        enable_latex_fix: bool = Field(
            default=True, description="规范化 LaTeX 公式 (\\[ -> $$, \\( -> $)"
        )
        # Experimental: insert missing newlines before list items (off by default).
        enable_list_fix: bool = Field(
            default=False, description="修复列表项换行 (实验性)"
        )
        # Append a closing fence when a code block is left open.
        enable_unclosed_block_fix: bool = Field(
            default=True, description="自动闭合未闭合的代码块"
        )
        # Convert full-width symbols inside code blocks (off by default).
        enable_fullwidth_symbol_fix: bool = Field(
            default=False, description="修复代码块中的全角符号"
        )
        # Repair common Mermaid errors such as unquoted labels.
        enable_mermaid_fix: bool = Field(
            default=True,
            description="修复常见的 Mermaid 语法错误 (如未加引号的标签)",
        )
        # Add the missing space after heading markers (#Header -> # Header).
        enable_heading_fix: bool = Field(
            default=True,
            description="修复标题中缺失的空格 (#Header -> # Header)",
        )
        # Add the missing closing pipe to table rows.
        enable_table_fix: bool = Field(
            default=True, description="修复表格中缺失的闭合管道符"
        )
        # Remove leftover XML tags.
        enable_xml_tag_cleanup: bool = Field(
            default=True, description="清理残留的 XML 标签"
        )
        # Show a status notification when fixes were applied.
        show_status: bool = Field(default=True, description="应用修复时显示状态通知")
        # Print a before/after debug log in the browser console (F12).
        show_debug_log: bool = Field(
            default=False, description="在浏览器控制台打印调试日志 (F12)"
        )
    def __init__(self) -> None:
        """Create the filter with a default-initialised ``Valves`` instance."""
        self.valves = self.Valves()
    def _contains_html(self, content: str) -> bool:
        """Report whether the text contains an opening or closing HTML tag.

        Used to skip normalization for HTML output. The match is
        case-insensitive and limited to the tag names listed in the pattern.
        """
        html_tag_re = r"<\s*/?\s*(?:html|head|body|div|span|p|br|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th|img|a|b|i|strong|em|code|pre|blockquote|h[1-6]|script|style|form|input|button|label|select|option|iframe|link|meta|title)\b"
        found = re.search(html_tag_re, content, re.IGNORECASE)
        return found is not None
    async def _emit_status(self, __event_emitter__, applied_fixes: List[str]):
        """Send a status notification listing the fixes that were applied.

        Nothing is sent when the ``show_status`` valve is off, when no fixes
        were applied, or when no emitter is available.

        Args:
            __event_emitter__: Async callable that receives the event dict.
            applied_fixes: Internal fix names recorded by the normalizer.
        """
        if (
            not self.valves.show_status
            or not applied_fixes
            or not __event_emitter__
        ):
            return

        # Translate fix names for status display; unknown names pass through.
        fix_map = {
            "Fix Escape Chars": "转义字符",
            "Normalize Thought Tags": "思维标签",
            "Fix Code Blocks": "代码块",
            "Normalize LaTeX": "LaTeX公式",
            "Fix List Format": "列表格式",
            "Close Code Blocks": "闭合代码块",
            "Fix Full-width Symbols": "全角符号",
            "Fix Mermaid Syntax": "Mermaid语法",
            "Fix Headings": "标题格式",
            "Fix Tables": "表格格式",
            "Cleanup XML Tags": "XML清理",
            "Custom Cleaner": "自定义清理",
        }
        translated_fixes = [fix_map.get(fix, fix) for fix in applied_fixes]
        description = f"✓ Markdown 已修复: {', '.join(translated_fixes)}"

        try:
            await __event_emitter__(
                {
                    "type": "status",
                    "data": {
                        "description": description,
                        "done": True,
                    },
                }
            )
        except Exception as e:
            # Best effort: a failed notification must not break the response.
            print(f"Error emitting status: {e}")
    async def _emit_debug_log(
        self, __event_call__, applied_fixes: List[str], original: str, normalized: str
    ):
        """Print a before/after debug report in the browser console.

        Sends an ``execute`` event whose JavaScript payload logs the applied
        fixes and both versions of the content inside a console group.
        Nothing is sent unless the ``show_debug_log`` valve is on and an
        event callable was supplied.

        Args:
            __event_call__: Async callable that receives the event dict.
            applied_fixes: Names of the fixes the normalizer applied.
            original: Message content before normalization.
            normalized: Message content after normalization.
        """
        if not self.valves.show_debug_log or not __event_call__:
            return
        try:
            # Values are serialized with json.dumps so they are embedded in
            # the script as quoted literals rather than raw text.
            js_code = f"""
                (async function() {{
                    console.group("🛠️ Markdown Normalizer Debug");
                    console.log("Applied Fixes:", {json.dumps(applied_fixes, ensure_ascii=False)});
                    console.log("Original Content:", {json.dumps(original, ensure_ascii=False)});
                    console.log("Normalized Content:", {json.dumps(normalized, ensure_ascii=False)});
                    console.groupEnd();
                }})();
            """
            await __event_call__(
                {
                    "type": "execute",
                    "data": {"code": js_code},
                }
            )
        except Exception as e:
            # Best effort: debug logging must never break the response.
            print(f"Error emitting debug log: {e}")
    async def outlet(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __event_emitter__=None,
        __event_call__=None,
        __metadata__: Optional[dict] = None,
    ) -> dict:
        """Normalize the Markdown of the last assistant message in ``body``.

        The body is returned untouched when it has no messages, when the last
        message is not a string-valued assistant message, or when the content
        looks like HTML. Otherwise the content is run through a
        ``ContentNormalizer`` configured from the valves and, if it changed,
        written back in place.

        Args:
            body: Response payload; ``body["messages"]`` is read and the last
                entry's ``content`` may be replaced.
            __user__: Unused.
            __event_emitter__: Optional async callable for the status event.
            __event_call__: Optional async callable for the console debug log.
            __metadata__: Unused.

        Returns:
            The same ``body`` object.
        """
        if "messages" not in body or not body["messages"]:
            return body

        last = body["messages"][-1]
        content = last.get("content", "") or ""
        if last.get("role") != "assistant" or not isinstance(content, str):
            return body

        # Skip if content looks like HTML to avoid breaking it
        if self._contains_html(content):
            return body

        # Configure normalizer based on valves
        config = NormalizerConfig(
            enable_escape_fix=self.valves.enable_escape_fix,
            enable_thought_tag_fix=self.valves.enable_thought_tag_fix,
            enable_code_block_fix=self.valves.enable_code_block_fix,
            enable_latex_fix=self.valves.enable_latex_fix,
            enable_list_fix=self.valves.enable_list_fix,
            enable_unclosed_block_fix=self.valves.enable_unclosed_block_fix,
            enable_fullwidth_symbol_fix=self.valves.enable_fullwidth_symbol_fix,
            enable_mermaid_fix=self.valves.enable_mermaid_fix,
            enable_heading_fix=self.valves.enable_heading_fix,
            enable_table_fix=self.valves.enable_table_fix,
            enable_xml_tag_cleanup=self.valves.enable_xml_tag_cleanup,
        )
        normalizer = ContentNormalizer(config)
        new_content = normalizer.normalize(content)
        if new_content == content:
            return body

        last["content"] = new_content

        if __event_emitter__:
            await self._emit_status(__event_emitter__, normalizer.applied_fixes)

        # The console log travels over __event_call__, so it must not depend
        # on __event_emitter__ being present; the helper itself returns early
        # when the valve is off or no callable was supplied.
        await self._emit_debug_log(
            __event_call__,
            normalizer.applied_fixes,
            content,
            new_content,
        )
        return body
--- a/plugins/filters/markdown_normalizer/test_markdown_normalizer.py
+++ b/plugins/filters/markdown_normalizer/test_markdown_normalizer.py
@@ -0,0 +1,191 @@
 import unittest
 import sys
 import os
 # Add the current directory to sys.path to import the module
 current_dir = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(current_dir)
 from markdown_normalizer import ContentNormalizer, NormalizerConfig
 class TestMarkdownNormalizer(unittest.TestCase):
    def setUp(self):
        """Build a normalizer with every fix switched on explicitly."""
        # Heading and table fixes are exercised by test_heading_fix and
        # test_table_fix, so they are enabled here instead of relying on the
        # config's default values.
        self.config = NormalizerConfig(
            enable_escape_fix=True,
            enable_thought_tag_fix=True,
            enable_code_block_fix=True,
            enable_latex_fix=True,
            enable_list_fix=True,
            enable_unclosed_block_fix=True,
            enable_fullwidth_symbol_fix=True,
            enable_mermaid_fix=True,
            enable_heading_fix=True,
            enable_table_fix=True,
            enable_xml_tag_cleanup=True,
        )
        self.normalizer = ContentNormalizer(self.config)
    def test_escape_fix(self):
        """Literal \\n and \\t sequences become a real newline and tab."""
        normalized = self.normalizer.normalize("Line 1\\nLine 2\\tTabbed")
        self.assertEqual(normalized, "Line 1\nLine 2\tTabbed")
    def test_thought_tag_fix(self):
        """Closing tags gain a blank line; <think> tags become <thought>."""
        cases = [
            # Standard tag spacing
            ("Thinking...</thought>Result", "Thinking...</thought>\n\nResult"),
            # Tag standardization (<think> -> <thought>)
            (
                "<think>Deep thinking...</think>Result",
                "<thought>Deep thinking...</thought>\n\nResult",
            ),
        ]
        for raw, wanted in cases:
            self.assertEqual(self.normalizer.normalize(raw), wanted)
    def test_code_block_fix(self):
        """Fence indentation, leading newline and post-language newline."""
        cases = (
            # Indentation in front of the fence is removed
            ("   ```python", "```python"),
            # A fence glued to text gets a newline before it
            ("Text```python", "Text\n```python"),
            # Code on the fence line moves to the next line
            ("```python print('hi')", "```python\nprint('hi')"),
        )
        for raw, wanted in cases:
            self.assertEqual(self.normalizer._fix_code_blocks(raw), wanted)
    def test_latex_fix(self):
        """Bracket and paren LaTeX delimiters are rewritten to dollar form."""
        normalized = self.normalizer.normalize(
            "Block: \\[ x^2 \\] Inline: \\( E=mc^2 \\)"
        )
        self.assertEqual(normalized, "Block: $$ x^2 $$ Inline: $ E=mc^2 $")
    def test_list_fix(self):
        """A list item glued to preceding text is moved onto its own line."""
        input_text_bad = "Header1. Item 1"
        expected = "Header\n1. Item 1"
        self.assertEqual(self.normalizer.normalize(input_text_bad), expected)
    def test_unclosed_code_block_fix(self):
        """A code block without a closing fence gets one appended."""
        unclosed = "```python\nprint('hello')"
        self.assertEqual(
            self.normalizer.normalize(unclosed),
            "```python\nprint('hello')\n```",
        )
    def test_fullwidth_symbol_fix(self):
        """Full-width symbols are converted inside code but kept in prose."""
        input_text = "Outside：Fullwidth ```python\nprint（'hello'）```"
        normalized = self.normalizer.normalize(input_text)
        # Fragments are checked instead of the whole string because other
        # fixes (fence newlines) also reshape this input.
        self.assertIn("print('hello')", normalized)
        self.assertIn("Outside：Fullwidth", normalized)
        self.assertNotIn("（", normalized)
        self.assertNotIn("）", normalized)
    def test_mermaid_fix(self):
        # Test Mermaid syntax fix for unquoted labels
        # Note: Regex-based fix handles mixed brackets well (e.g. [] inside ())
        # but cannot perfectly handle same-type nesting (e.g. {} inside {}) without a parser.
        input_text = """
 ```mermaid
 graph TD
    A[Label with (parens)] --> B(Label with [brackets])
    C{Label with [brackets]}
 ```
 """
        expected_snippet = """
 ```mermaid
 graph TD
    A["Label with (parens)"] --> B("Label with [brackets]")
    C{"Label with [brackets]"}
 ```
 """
        normalized = self.normalizer.normalize(input_text)
        self.assertIn('A["Label with (parens)"]', normalized)
        self.assertIn('B("Label with [brackets]")', normalized)
        self.assertIn('C{"Label with [brackets]"}', normalized)
    def test_mermaid_shapes_regression(self):
        # Regression test for "reverse optimization" where ((...)) was broken into ("(...)")
        input_text = """
 ```mermaid
 graph TD
    Start((开始)) --> Input[[输入]]
    Input --> Verify{验证}
    Verify --> End(((结束)))
 ```
 """
        expected_snippet = """
 ```mermaid
 graph TD
    Start(("开始")) --> Input[["输入"]]
    Input --> Verify{"验证"}
    Verify --> End((("结束")))
 ```
 """
        normalized = self.normalizer.normalize(input_text)
        self.assertIn('Start(("开始"))', normalized)
        self.assertIn('Input[["输入"]]', normalized)
        self.assertIn('Verify{"验证"}', normalized)
        self.assertIn('End((("结束")))', normalized)
    def test_xml_cleanup(self):
        """Artifact tags are stripped while their inner text is kept."""
        normalized = self.normalizer.normalize(
            "Some text <antArtifact>hidden</antArtifact> visible"
        )
        self.assertEqual(normalized, "Some text hidden visible")
    def test_heading_fix(self):
        """Headings missing the space after '#' are repaired; valid ones kept."""
        normalized = self.normalizer.normalize(
            "#Heading 1\n##Heading 2\n### Valid Heading"
        )
        self.assertEqual(normalized, "# Heading 1\n## Heading 2\n### Valid Heading")
    def test_table_fix(self):
        """Table rows without a trailing pipe get one appended."""
        normalized = self.normalizer.normalize("| Col 1 | Col 2\n| Val 1 | Val 2")
        self.assertEqual(normalized, "| Col 1 | Col 2|\n| Val 1 | Val 2|")
    def test_mermaid_subgraph_autoclose(self):
        """Test auto-closing of Mermaid subgraphs"""
        # Case 1: a single subgraph with no closing `end`
        original = """
 ```mermaid
 graph TD
    subgraph One
        A --> B
 ```
 """
        expected = """
 ```mermaid
 graph TD
    subgraph One
        A --> B
    end
 ```
 """
        # A and B are bare node IDs with no shape delimiter. The test assumes
        # the mermaid_node pattern only matches IDs followed by a shape, so
        # they are expected to stay unquoted.
        normalized = self.normalizer.normalize(original)
        # Compare stripped text so surrounding whitespace does not matter
        self.assertIn("end", normalized)
        self.assertEqual(normalized.strip(), expected.strip())
        # Case 2: two nested subgraphs, neither closed -> two `end` lines
        original_nested = """
 ```mermaid
 graph TD
    subgraph Outer
        subgraph Inner
            C --> D
 ```
 """
        normalized_nested = self.normalizer.normalize(original_nested)
        self.assertEqual(normalized_nested.count("end"), 2)
 # Allow running this test module directly as a script.
 if __name__ == "__main__":
    unittest.main()