From ef34cc326c9316a0a833473e827e7396a4fe2865 Mon Sep 17 00:00:00 2001 From: fujie Date: Sat, 10 Jan 2026 15:45:20 +0800 Subject: [PATCH] feat: enhance markdown normalizer with mermaid fix and frontend logging --- .github/copilot-instructions.md | 41 +- .../markdown_normalizer.py | 519 +++++++++++++++++ .../markdown_normalizer_cn.py | 544 ++++++++++++++++++ .../test_markdown_normalizer.py | 191 ++++++ 4 files changed, 1294 insertions(+), 1 deletion(-) create mode 100644 plugins/filters/markdown_normalizer/markdown_normalizer.py create mode 100644 plugins/filters/markdown_normalizer/markdown_normalizer_cn.py create mode 100644 plugins/filters/markdown_normalizer/test_markdown_normalizer.py diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index c3bd62e..5ce1210 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -260,7 +260,46 @@ async def _emit_notification( ## 📋 日志规范 (Logging Standard) -- **禁止使用** `print()` 语句 +### 1. 前端控制台调试 (Frontend Console Debugging) - **优先推荐 (Preferred)** + +对于需要实时查看数据流、排查 UI 交互或内容变更的场景,**优先使用**前端控制台日志。这种方式可以直接在浏览器 DevTools (F12) 中查看,无需访问服务端日志。 + +**实现方式**: 通过 `__event_emitter__` 发送 `type: "execute"` 事件执行 JS 代码。 + +```python +import json + +async def _emit_debug_log(self, __event_emitter__, title: str, data: dict): + """在浏览器控制台打印结构化调试日志""" + if not self.valves.show_debug_log or not __event_emitter__: + return + + try: + js_code = f""" + (async function() {{ + console.group("🛠️ {title}"); + console.log({json.dumps(data, ensure_ascii=False)}); + console.groupEnd(); + }})(); + """ + + await __event_emitter__({ + "type": "execute", + "data": {"code": js_code} + }) + except Exception as e: + print(f"Error emitting debug log: {e}") +``` + +**配置要求**: +- 在 `Valves` 中添加 `show_debug_log: bool` 开关,默认关闭。 +- 仅在开关开启时发送日志。 + +### 2. 
服务端日志 (Server-side Logging) + +用于记录系统级错误、异常堆栈或无需前端感知的后台任务。 + +- **禁止使用** `print()` 语句 (除非用于简单的脚本调试) - 必须使用 Python 标准库 `logging` ```python diff --git a/plugins/filters/markdown_normalizer/markdown_normalizer.py b/plugins/filters/markdown_normalizer/markdown_normalizer.py new file mode 100644 index 0000000..7d7c932 --- /dev/null +++ b/plugins/filters/markdown_normalizer/markdown_normalizer.py @@ -0,0 +1,519 @@ +""" +title: Markdown Normalizer +author: Fu-Jie +author_url: https://github.com/Fu-Jie +funding_url: https://github.com/Fu-Jie/awesome-openwebui +version: 1.0.0 +description: A production-grade content normalizer filter that fixes common Markdown formatting issues in LLM outputs, such as broken code blocks, LaTeX formulas, and list formatting. +""" + +from pydantic import BaseModel, Field +from typing import Optional, List, Callable +import re +import logging +import logging +import asyncio +import json +from dataclasses import dataclass, field + +# Configure logging +logger = logging.getLogger(__name__) + + +@dataclass +class NormalizerConfig: + """Configuration class for enabling/disabling specific normalization rules""" + + enable_escape_fix: bool = True # Fix excessive escape characters + enable_thought_tag_fix: bool = True # Normalize thought tags + enable_code_block_fix: bool = True # Fix code block formatting + enable_latex_fix: bool = True # Fix LaTeX formula formatting + enable_list_fix: bool = ( + False # Fix list item newlines (default off as it can be aggressive) + ) + enable_unclosed_block_fix: bool = True # Auto-close unclosed code blocks + enable_fullwidth_symbol_fix: bool = False # Fix full-width symbols in code blocks + enable_mermaid_fix: bool = True # Fix common Mermaid syntax errors + enable_heading_fix: bool = ( + True # Fix missing space in headings (#Header -> # Header) + ) + enable_table_fix: bool = True # Fix missing closing pipe in tables + enable_xml_tag_cleanup: bool = True # Cleanup leftover XML tags + + # Custom cleaner functions 
(for advanced extension) + custom_cleaners: List[Callable[[str], str]] = field(default_factory=list) + + +class ContentNormalizer: + """LLM Output Content Normalizer - Production Grade Implementation""" + + # --- 1. Pre-compiled Regex Patterns (Performance Optimization) --- + _PATTERNS = { + # Code block prefix: if ``` is not at start of line or file + "code_block_prefix": re.compile(r"(? followed by optional whitespace/newlines + "thought_end": re.compile( + r"[ \t]*\n*", re.IGNORECASE + ), + "thought_start": re.compile(r"<(thought|think|thinking)>", re.IGNORECASE), + # LaTeX block: \[ ... \] + "latex_bracket_block": re.compile(r"\\\[(.+?)\\\]", re.DOTALL), + # LaTeX inline: \( ... \) + "latex_paren_inline": re.compile(r"\\\((.+?)\\\)"), + # List item: non-newline + digit + dot + space + "list_item": re.compile(r"([^\n])(\d+\. )"), + # XML artifacts (e.g. Claude's) + "xml_artifacts": re.compile( + r"]*>", re.IGNORECASE + ), + # Mermaid: Match various node shapes and quote unquoted labels + # Fix "reverse optimization": Must precisely match shape delimiters to avoid breaking structure + # Priority: Longer delimiters match first + "mermaid_node": re.compile( + r"(\w+)\s*(?:" + r"(\(\(\()(?![\"])(.*?)(?)(?![\"])(.*?)(?...] Asymmetric + r")" + ), + # Heading: #Heading -> # Heading + "heading_space": re.compile(r"^(#+)([^ \n#])", re.MULTILINE), + # Table: | col1 | col2 -> | col1 | col2 | + "table_pipe": re.compile(r"^(\|.*[^|\n])$", re.MULTILINE), + } + + def __init__(self, config: Optional[NormalizerConfig] = None): + self.config = config or NormalizerConfig() + self.applied_fixes = [] + + def normalize(self, content: str) -> str: + """Main entry point: apply all normalization rules in order""" + self.applied_fixes = [] + if not content: + return content + + original_content = content # Keep a copy for logging + + try: + # 1. 
Escape character fix (Must be first) + if self.config.enable_escape_fix: + original = content + content = self._fix_escape_characters(content) + if content != original: + self.applied_fixes.append("Fix Escape Chars") + + # 2. Thought tag normalization + if self.config.enable_thought_tag_fix: + original = content + content = self._fix_thought_tags(content) + if content != original: + self.applied_fixes.append("Normalize Thought Tags") + + # 3. Code block formatting fix + if self.config.enable_code_block_fix: + original = content + content = self._fix_code_blocks(content) + if content != original: + self.applied_fixes.append("Fix Code Blocks") + + # 4. LaTeX formula normalization + if self.config.enable_latex_fix: + original = content + content = self._fix_latex_formulas(content) + if content != original: + self.applied_fixes.append("Normalize LaTeX") + + # 5. List formatting fix + if self.config.enable_list_fix: + original = content + content = self._fix_list_formatting(content) + if content != original: + self.applied_fixes.append("Fix List Format") + + # 6. Unclosed code block fix + if self.config.enable_unclosed_block_fix: + original = content + content = self._fix_unclosed_code_blocks(content) + if content != original: + self.applied_fixes.append("Close Code Blocks") + + # 7. Full-width symbol fix (in code blocks only) + if self.config.enable_fullwidth_symbol_fix: + original = content + content = self._fix_fullwidth_symbols_in_code(content) + if content != original: + self.applied_fixes.append("Fix Full-width Symbols") + + # 8. Mermaid syntax fix + if self.config.enable_mermaid_fix: + original = content + content = self._fix_mermaid_syntax(content) + if content != original: + self.applied_fixes.append("Fix Mermaid Syntax") + + # 9. Heading fix + if self.config.enable_heading_fix: + original = content + content = self._fix_headings(content) + if content != original: + self.applied_fixes.append("Fix Headings") + + # 10. 
Table fix + if self.config.enable_table_fix: + original = content + content = self._fix_tables(content) + if content != original: + self.applied_fixes.append("Fix Tables") + + # 11. XML tag cleanup + if self.config.enable_xml_tag_cleanup: + original = content + content = self._cleanup_xml_tags(content) + if content != original: + self.applied_fixes.append("Cleanup XML Tags") + + # 9. Custom cleaners + for cleaner in self.config.custom_cleaners: + original = content + content = cleaner(content) + if content != original: + self.applied_fixes.append("Custom Cleaner") + + if self.applied_fixes: + logger.info(f"Markdown Normalizer Applied Fixes: {self.applied_fixes}") + logger.debug( + f"--- Original Content ---\n{original_content}\n------------------------" + ) + logger.debug( + f"--- Normalized Content ---\n{content}\n--------------------------" + ) + + return content + + except Exception as e: + # Production safeguard: return original content on error + logger.error(f"Content normalization failed: {e}", exc_info=True) + return content + + def _fix_escape_characters(self, content: str) -> str: + """Fix excessive escape characters""" + content = content.replace("\\r\\n", "\n") + content = content.replace("\\n", "\n") + content = content.replace("\\t", "\t") + content = content.replace("\\\\", "\\") + return content + + def _fix_thought_tags(self, content: str) -> str: + """Normalize thought tags: unify naming and fix spacing""" + # 1. Standardize start tag: , -> + content = self._PATTERNS["thought_start"].sub("", content) + # 2. 
Standardize end tag and ensure newlines: -> \n\n + return self._PATTERNS["thought_end"].sub("\n\n", content) + + def _fix_code_blocks(self, content: str) -> str: + """Fix code block formatting (prefixes, suffixes, indentation)""" + # Remove indentation before code blocks + content = self._PATTERNS["code_block_indent"].sub(r"\1", content) + # Ensure newline before ``` + content = self._PATTERNS["code_block_prefix"].sub(r"\n\1", content) + # Ensure newline after ```lang + content = self._PATTERNS["code_block_suffix"].sub(r"\1\n\2", content) + return content + + def _fix_latex_formulas(self, content: str) -> str: + """Normalize LaTeX formulas: \[ -> $$ (block), \( -> $ (inline)""" + content = self._PATTERNS["latex_bracket_block"].sub(r"$$\1$$", content) + content = self._PATTERNS["latex_paren_inline"].sub(r"$\1$", content) + return content + + def _fix_list_formatting(self, content: str) -> str: + """Fix missing newlines in lists (e.g., 'text1. item' -> 'text\\n1. item')""" + return self._PATTERNS["list_item"].sub(r"\1\n\2", content) + + def _fix_unclosed_code_blocks(self, content: str) -> str: + """Auto-close unclosed code blocks""" + if content.count("```") % 2 != 0: + content += "\n```" + return content + + def _fix_fullwidth_symbols_in_code(self, content: str) -> str: + """Convert full-width symbols to half-width inside code blocks""" + FULLWIDTH_MAP = { + ",": ",", + "。": ".", + "(": "(", + ")": ")", + "【": "[", + "】": "]", + ";": ";", + ":": ":", + "?": "?", + "!": "!", + '"': '"', + '"': '"', + """: "'", """: "'", + } + + parts = content.split("```") + # Code block content is at odd indices: 1, 3, 5... 
    def _fix_mermaid_syntax(self, content: str) -> str:
        """Fix common Mermaid syntax errors while preserving node shapes.

        Applies two heuristics to fenced ``mermaid`` blocks only:

        1. Wraps unquoted node labels in double quotes (via the precompiled
           ``mermaid_node`` pattern) so special characters inside labels
           cannot break the diagram structure.
        2. Appends missing ``end`` keywords when the ``subgraph`` count
           exceeds the ``end`` count.
        """

        def replacer(match):
            # Group 1 is ID (the node identifier preceding the shape delimiter).
            id_str = match.group(1)

            # Find matching shape group.
            # Groups start at index 2, each shape has 3 groups (Open, Content, Close).
            # We iterate to find the non-None one.
            groups = match.groups()
            for i in range(1, len(groups), 3):
                if groups[i] is not None:
                    open_char = groups[i]
                    # NOTE: this local deliberately shadows the outer `content`
                    # parameter; it only lives inside the replacer.
                    content = groups[i + 1]
                    close_char = groups[i + 2]

                    # Escape quotes in content before re-wrapping the label.
                    content = content.replace('"', '\\"')

                    return f'{id_str}{open_char}"{content}"{close_char}'

            # No shape delimiter matched: leave the original text untouched.
            return match.group(0)

        # Fenced code blocks sit at odd indices after splitting on ```.
        parts = content.split("```")
        for i in range(1, len(parts), 2):
            # The first line of a fenced block carries the fence language.
            lang_line = parts[i].split("\n", 1)[0].strip().lower()
            if "mermaid" in lang_line:
                # Apply the comprehensive regex fix
                parts[i] = self._PATTERNS["mermaid_node"].sub(replacer, parts[i])

                # Auto-close subgraphs.
                # NOTE(review): `\bend\b` also counts a standalone word "end"
                # inside quoted labels or comments, so this heuristic can
                # under-close in rare cases — confirm this is acceptable.
                subgraph_count = len(
                    re.findall(r"\bsubgraph\b", parts[i], re.IGNORECASE)
                )
                end_count = len(re.findall(r"\bend\b", parts[i], re.IGNORECASE))

                if subgraph_count > end_count:
                    missing_ends = subgraph_count - end_count
                    parts[i] = parts[i].rstrip() + ("\n end" * missing_ends) + "\n"

        return "```".join(parts)
+ + parts = content.split("```") + for i in range(0, len(parts), 2): # Even indices are markdown text + parts[i] = self._PATTERNS["heading_space"].sub(r"\1 \2", parts[i]) + return "```".join(parts) + + def _fix_tables(self, content: str) -> str: + """Fix tables missing closing pipe""" + parts = content.split("```") + for i in range(0, len(parts), 2): + parts[i] = self._PATTERNS["table_pipe"].sub(r"\1|", parts[i]) + return "```".join(parts) + + def _cleanup_xml_tags(self, content: str) -> str: + """Remove leftover XML tags""" + return self._PATTERNS["xml_artifacts"].sub("", content) + + +class Filter: + class Valves(BaseModel): + priority: int = Field( + default=50, + description="Priority level. Higher runs later (recommended to run after other filters).", + ) + enable_escape_fix: bool = Field( + default=True, description="Fix excessive escape characters (\\n, \\t, etc.)" + ) + enable_thought_tag_fix: bool = Field( + default=True, description="Normalize tags" + ) + enable_code_block_fix: bool = Field( + default=True, + description="Fix code block formatting (indentation, newlines)", + ) + enable_latex_fix: bool = Field( + default=True, description="Normalize LaTeX formulas (\\[ -> $$, \\( -> $)" + ) + enable_list_fix: bool = Field( + default=False, description="Fix list item newlines (Experimental)" + ) + enable_unclosed_block_fix: bool = Field( + default=True, description="Auto-close unclosed code blocks" + ) + enable_fullwidth_symbol_fix: bool = Field( + default=False, description="Fix full-width symbols in code blocks" + ) + enable_mermaid_fix: bool = Field( + default=True, + description="Fix common Mermaid syntax errors (e.g. 
unquoted labels)", + ) + enable_heading_fix: bool = Field( + default=True, + description="Fix missing space in headings (#Header -> # Header)", + ) + enable_table_fix: bool = Field( + default=True, description="Fix missing closing pipe in tables" + ) + enable_xml_tag_cleanup: bool = Field( + default=True, description="Cleanup leftover XML tags" + ) + show_status: bool = Field( + default=True, description="Show status notification when fixes are applied" + ) + show_debug_log: bool = Field( + default=False, description="Print debug logs to browser console (F12)" + ) + + def __init__(self): + self.valves = self.Valves() + + def _contains_html(self, content: str) -> bool: + """Check if content contains HTML tags (to avoid breaking HTML output)""" + pattern = r"<\s*/?\s*(?:html|head|body|div|span|p|br|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th|img|a|b|i|strong|em|code|pre|blockquote|h[1-6]|script|style|form|input|button|label|select|option|iframe|link|meta|title)\b" + return bool(re.search(pattern, content, re.IGNORECASE)) + + async def _emit_status(self, __event_emitter__, applied_fixes: List[str]): + """Emit status notification""" + if not self.valves.show_status or not applied_fixes: + return + + description = "✓ Markdown Normalized" + if applied_fixes: + description += f": {', '.join(applied_fixes)}" + + try: + await __event_emitter__( + { + "type": "status", + "data": { + "description": description, + "done": True, + }, + } + ) + except Exception as e: + print(f"Error emitting status: {e}") + + async def _emit_debug_log( + self, __event_call__, applied_fixes: List[str], original: str, normalized: str + ): + """Emit debug log to browser console via JS execution""" + if not self.valves.show_debug_log or not __event_call__: + return + + try: + # Prepare data for JS + log_data = { + "fixes": applied_fixes, + "original": original, + "normalized": normalized, + } + + # Construct JS code + js_code = f""" + (async function() {{ + console.group("🛠️ Markdown Normalizer 
Debug"); + console.log("Applied Fixes:", {json.dumps(applied_fixes, ensure_ascii=False)}); + console.log("Original Content:", {json.dumps(original, ensure_ascii=False)}); + console.log("Normalized Content:", {json.dumps(normalized, ensure_ascii=False)}); + console.groupEnd(); + }})(); + """ + + await __event_call__( + { + "type": "execute", + "data": {"code": js_code}, + } + ) + except Exception as e: + print(f"Error emitting debug log: {e}") + + async def outlet( + self, + body: dict, + __user__: Optional[dict] = None, + __event_emitter__=None, + __event_call__=None, + __metadata__: Optional[dict] = None, + ) -> dict: + """ + Process the response body to normalize Markdown content. + """ + if "messages" in body and body["messages"]: + last = body["messages"][-1] + content = last.get("content", "") or "" + + if last.get("role") == "assistant" and isinstance(content, str): + # Skip if content looks like HTML to avoid breaking it + if self._contains_html(content): + return body + + # Configure normalizer based on valves + config = NormalizerConfig( + enable_escape_fix=self.valves.enable_escape_fix, + enable_thought_tag_fix=self.valves.enable_thought_tag_fix, + enable_code_block_fix=self.valves.enable_code_block_fix, + enable_latex_fix=self.valves.enable_latex_fix, + enable_list_fix=self.valves.enable_list_fix, + enable_unclosed_block_fix=self.valves.enable_unclosed_block_fix, + enable_fullwidth_symbol_fix=self.valves.enable_fullwidth_symbol_fix, + enable_mermaid_fix=self.valves.enable_mermaid_fix, + enable_heading_fix=self.valves.enable_heading_fix, + enable_table_fix=self.valves.enable_table_fix, + enable_xml_tag_cleanup=self.valves.enable_xml_tag_cleanup, + ) + + normalizer = ContentNormalizer(config) + + # Execute normalization + new_content = normalizer.normalize(content) + + # Update content if changed + if new_content != content: + last["content"] = new_content + + # Emit status if enabled + if __event_emitter__: + await self._emit_status( + __event_emitter__, 
normalizer.applied_fixes + ) + await self._emit_debug_log( + __event_call__, + normalizer.applied_fixes, + content, + new_content, + ) + + return body diff --git a/plugins/filters/markdown_normalizer/markdown_normalizer_cn.py b/plugins/filters/markdown_normalizer/markdown_normalizer_cn.py new file mode 100644 index 0000000..6adaf71 --- /dev/null +++ b/plugins/filters/markdown_normalizer/markdown_normalizer_cn.py @@ -0,0 +1,544 @@ +""" +title: Markdown 格式修复器 (Markdown Normalizer) +author: Fu-Jie +author_url: https://github.com/Fu-Jie +funding_url: https://github.com/Fu-Jie/awesome-openwebui +version: 1.0.0 +description: 生产级内容规范化过滤器,修复 LLM 输出中常见的 Markdown 格式问题,如损坏的代码块、LaTeX 公式、Mermaid 图表和列表格式。 +""" + +from pydantic import BaseModel, Field +from typing import Optional, List, Callable +import re +import logging +import asyncio +import json +from dataclasses import dataclass, field + +# Configure logging +logger = logging.getLogger(__name__) + + +@dataclass +class NormalizerConfig: + """配置类,用于启用/禁用特定的规范化规则""" + + enable_escape_fix: bool = True # 修复过度的转义字符 + enable_thought_tag_fix: bool = True # 规范化思维链标签 + enable_code_block_fix: bool = True # 修复代码块格式 + enable_latex_fix: bool = True # 修复 LaTeX 公式格式 + enable_list_fix: bool = False # 修复列表项换行 (默认关闭,因为可能过于激进) + enable_unclosed_block_fix: bool = True # 自动闭合未闭合的代码块 + enable_fullwidth_symbol_fix: bool = False # 修复代码块中的全角符号 + enable_mermaid_fix: bool = True # 修复常见的 Mermaid 语法错误 + enable_heading_fix: bool = True # 修复标题中缺失的空格 (#Header -> # Header) + enable_table_fix: bool = True # 修复表格中缺失的闭合管道符 + enable_xml_tag_cleanup: bool = True # 清理残留的 XML 标签 + + # 自定义清理函数 (用于高级扩展) + custom_cleaners: List[Callable[[str], str]] = field(default_factory=list) + + +class ContentNormalizer: + """LLM Output Content Normalizer - Production Grade Implementation""" + + # --- 1. Pre-compiled Regex Patterns (Performance Optimization) --- + _PATTERNS = { + # Code block prefix: if ``` is not at start of line or file + "code_block_prefix": re.compile(r"(? 
followed by optional whitespace/newlines + "thought_end": re.compile( + r"[ \t]*\n*", re.IGNORECASE + ), + "thought_start": re.compile(r"<(thought|think|thinking)>", re.IGNORECASE), + # LaTeX block: \[ ... \] + "latex_bracket_block": re.compile(r"\\\[(.+?)\\\]", re.DOTALL), + # LaTeX inline: \( ... \) + "latex_paren_inline": re.compile(r"\\\((.+?)\\\)"), + # List item: non-newline + digit + dot + space + "list_item": re.compile(r"([^\n])(\d+\. )"), + # XML artifacts (e.g. Claude's) + "xml_artifacts": re.compile( + r"]*>", re.IGNORECASE + ), + # Mermaid: 匹配各种形状的节点并为未加引号的标签添加引号 + # 修复"反向优化"问题:必须精确匹配各种形状的定界符,避免破坏形状结构 + # 优先级:长定界符优先匹配 + "mermaid_node": re.compile( + r"(\w+)\s*(?:" + r"(\(\(\()(?![\"])(.*?)(?)(?![\"])(.*?)(?...] Asymmetric + r")" + ), + # Heading: #Heading -> # Heading + "heading_space": re.compile(r"^(#+)([^ \n#])", re.MULTILINE), + # Table: | col1 | col2 -> | col1 | col2 | + "table_pipe": re.compile(r"^(\|.*[^|\n])$", re.MULTILINE), + } + + def __init__(self, config: Optional[NormalizerConfig] = None): + self.config = config or NormalizerConfig() + self.applied_fixes = [] + + def normalize(self, content: str) -> str: + """Main entry point: apply all normalization rules in order""" + self.applied_fixes = [] + if not content: + return content + + original_content = content # Keep a copy for logging + + try: + # 1. Escape character fix (Must be first) + if self.config.enable_escape_fix: + original = content + content = self._fix_escape_characters(content) + if content != original: + self.applied_fixes.append("Fix Escape Chars") + + # 2. Thought tag normalization + if self.config.enable_thought_tag_fix: + original = content + content = self._fix_thought_tags(content) + if content != original: + self.applied_fixes.append("Normalize Thought Tags") + + # 3. 
Code block formatting fix + if self.config.enable_code_block_fix: + original = content + content = self._fix_code_blocks(content) + if content != original: + self.applied_fixes.append("Fix Code Blocks") + + # 4. LaTeX formula normalization + if self.config.enable_latex_fix: + original = content + content = self._fix_latex_formulas(content) + if content != original: + self.applied_fixes.append("Normalize LaTeX") + + # 5. List formatting fix + if self.config.enable_list_fix: + original = content + content = self._fix_list_formatting(content) + if content != original: + self.applied_fixes.append("Fix List Format") + + # 6. Unclosed code block fix + if self.config.enable_unclosed_block_fix: + original = content + content = self._fix_unclosed_code_blocks(content) + if content != original: + self.applied_fixes.append("Close Code Blocks") + + # 7. Full-width symbol fix (in code blocks only) + if self.config.enable_fullwidth_symbol_fix: + original = content + content = self._fix_fullwidth_symbols_in_code(content) + if content != original: + self.applied_fixes.append("Fix Full-width Symbols") + + # 8. Mermaid syntax fix + if self.config.enable_mermaid_fix: + original = content + content = self._fix_mermaid_syntax(content) + if content != original: + self.applied_fixes.append("Fix Mermaid Syntax") + + # 9. Heading fix + if self.config.enable_heading_fix: + original = content + content = self._fix_headings(content) + if content != original: + self.applied_fixes.append("Fix Headings") + + # 10. Table fix + if self.config.enable_table_fix: + original = content + content = self._fix_tables(content) + if content != original: + self.applied_fixes.append("Fix Tables") + + # 11. XML tag cleanup + if self.config.enable_xml_tag_cleanup: + original = content + content = self._cleanup_xml_tags(content) + if content != original: + self.applied_fixes.append("Cleanup XML Tags") + + # 9. 
Custom cleaners + for cleaner in self.config.custom_cleaners: + original = content + content = cleaner(content) + if content != original: + self.applied_fixes.append("Custom Cleaner") + + if self.applied_fixes: + print(f"[Markdown Normalizer] Applied fixes: {self.applied_fixes}") + print( + f"[Markdown Normalizer] --- Original Content ---\n{original_content}\n------------------------" + ) + print( + f"[Markdown Normalizer] --- Normalized Content ---\n{content}\n--------------------------" + ) + + return content + + except Exception as e: + # Production safeguard: return original content on error + logger.error(f"Content normalization failed: {e}", exc_info=True) + return content + + def _fix_escape_characters(self, content: str) -> str: + """Fix excessive escape characters""" + content = content.replace("\\r\\n", "\n") + content = content.replace("\\n", "\n") + content = content.replace("\\t", "\t") + content = content.replace("\\\\", "\\") + return content + + def _fix_thought_tags(self, content: str) -> str: + """Normalize thought tags: unify naming and fix spacing""" + # 1. Standardize start tag: , -> + content = self._PATTERNS["thought_start"].sub("", content) + # 2. 
Standardize end tag and ensure newlines: -> \n\n + return self._PATTERNS["thought_end"].sub("\n\n", content) + + def _fix_code_blocks(self, content: str) -> str: + """Fix code block formatting (prefixes, suffixes, indentation)""" + # Remove indentation before code blocks + content = self._PATTERNS["code_block_indent"].sub(r"\1", content) + # Ensure newline before ``` + content = self._PATTERNS["code_block_prefix"].sub(r"\n\1", content) + # Ensure newline after ```lang + content = self._PATTERNS["code_block_suffix"].sub(r"\1\n\2", content) + return content + + def _fix_latex_formulas(self, content: str) -> str: + """Normalize LaTeX formulas: \[ -> $$ (block), \( -> $ (inline)""" + content = self._PATTERNS["latex_bracket_block"].sub(r"$$\1$$", content) + content = self._PATTERNS["latex_paren_inline"].sub(r"$\1$", content) + return content + + def _fix_list_formatting(self, content: str) -> str: + """Fix missing newlines in lists (e.g., 'text1. item' -> 'text\\n1. item')""" + return self._PATTERNS["list_item"].sub(r"\1\n\2", content) + + def _fix_unclosed_code_blocks(self, content: str) -> str: + """Auto-close unclosed code blocks""" + if content.count("```") % 2 != 0: + content += "\n```" + return content + + def _fix_fullwidth_symbols_in_code(self, content: str) -> str: + """Convert full-width symbols to half-width inside code blocks""" + FULLWIDTH_MAP = { + ",": ",", + "。": ".", + "(": "(", + ")": ")", + "【": "[", + "】": "]", + ";": ";", + ":": ":", + "?": "?", + "!": "!", + '"': '"', + '"': '"', + """: "'", """: "'", + } + + parts = content.split("```") + # Code block content is at odd indices: 1, 3, 5... 
+ for i in range(1, len(parts), 2): + for full, half in FULLWIDTH_MAP.items(): + parts[i] = parts[i].replace(full, half) + + return "```".join(parts) + + def _fix_mermaid_syntax(self, content: str) -> str: + """修复常见的 Mermaid 语法错误,同时保留节点形状""" + + def replacer(match): + # Group 1 是 ID + id_str = match.group(1) + + # 查找匹配的形状组 + # 组从索引 2 开始,每个形状有 3 个组 (Open, Content, Close) + # 我们遍历找到非 None 的那一组 + groups = match.groups() + for i in range(1, len(groups), 3): + if groups[i] is not None: + open_char = groups[i] + content = groups[i + 1] + close_char = groups[i + 2] + + # 如果内容包含引号,进行转义 + content = content.replace('"', '\\"') + + return f'{id_str}{open_char}"{content}"{close_char}' + + return match.group(0) + + parts = content.split("```") + for i in range(1, len(parts), 2): + # Check if it's a mermaid block + lang_line = parts[i].split("\n", 1)[0].strip().lower() + if "mermaid" in lang_line: + # Apply the comprehensive regex fix + parts[i] = self._PATTERNS["mermaid_node"].sub(replacer, parts[i]) + + # Auto-close subgraphs + # Count 'subgraph' and 'end' (case-insensitive) + # We use a simple regex to avoid matching words inside labels (though labels are now quoted, so it's safer) + # But for simplicity and speed, we just count occurrences in the whole block. + # A more robust way would be to strip quoted strings first, but that's expensive. + # Given we just quoted everything, let's try to count keywords outside quotes? + # Actually, since we just normalized nodes, most text is in quotes. + # Let's just do a simple count. It's a heuristic fix. 
+ subgraph_count = len( + re.findall(r"\bsubgraph\b", parts[i], re.IGNORECASE) + ) + end_count = len(re.findall(r"\bend\b", parts[i], re.IGNORECASE)) + + if subgraph_count > end_count: + missing_ends = subgraph_count - end_count + parts[i] = parts[i].rstrip() + ("\n end" * missing_ends) + "\n" + + return "```".join(parts) + + def _fix_headings(self, content: str) -> str: + """Fix missing space in headings: #Heading -> # Heading""" + # We only fix if it's not inside a code block. + # But splitting by code block is expensive. + # Given headings usually don't appear inside code blocks without space in valid code (except comments), + # we might risk false positives in comments like `#TODO`. + # To be safe, let's split by code blocks. + + parts = content.split("```") + for i in range(0, len(parts), 2): # Even indices are markdown text + parts[i] = self._PATTERNS["heading_space"].sub(r"\1 \2", parts[i]) + return "```".join(parts) + + def _fix_tables(self, content: str) -> str: + """Fix tables missing closing pipe""" + parts = content.split("```") + for i in range(0, len(parts), 2): + parts[i] = self._PATTERNS["table_pipe"].sub(r"\1|", parts[i]) + return "```".join(parts) + + def _cleanup_xml_tags(self, content: str) -> str: + """Remove leftover XML tags""" + return self._PATTERNS["xml_artifacts"].sub("", content) + + +class Filter: + class Valves(BaseModel): + priority: int = Field( + default=50, + description="优先级。数值越高运行越晚 (建议在其他过滤器之后运行)。", + ) + enable_escape_fix: bool = Field( + default=True, description="修复过度的转义字符 (\\n, \\t 等)" + ) + enable_thought_tag_fix: bool = Field( + default=True, description="规范化思维链标签 ( -> )" + ) + enable_code_block_fix: bool = Field( + default=True, + description="修复代码块格式 (缩进、换行)", + ) + enable_latex_fix: bool = Field( + default=True, description="规范化 LaTeX 公式 (\\[ -> $$, \\( -> $)" + ) + enable_list_fix: bool = Field( + default=False, description="修复列表项换行 (实验性)" + ) + enable_unclosed_block_fix: bool = Field( + default=True, 
description="自动闭合未闭合的代码块" + ) + enable_fullwidth_symbol_fix: bool = Field( + default=False, description="修复代码块中的全角符号" + ) + enable_mermaid_fix: bool = Field( + default=True, + description="修复常见的 Mermaid 语法错误 (如未加引号的标签)", + ) + enable_heading_fix: bool = Field( + default=True, + description="修复标题中缺失的空格 (#Header -> # Header)", + ) + enable_table_fix: bool = Field( + default=True, description="修复表格中缺失的闭合管道符" + ) + enable_xml_tag_cleanup: bool = Field( + default=True, description="清理残留的 XML 标签" + ) + show_status: bool = Field(default=True, description="应用修复时显示状态通知") + show_debug_log: bool = Field( + default=False, description="在浏览器控制台打印调试日志 (F12)" + ) + + def __init__(self): + self.valves = self.Valves() + + def _contains_html(self, content: str) -> bool: + """Check if content contains HTML tags (to avoid breaking HTML output)""" + pattern = r"<\s*/?\s*(?:html|head|body|div|span|p|br|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th|img|a|b|i|strong|em|code|pre|blockquote|h[1-6]|script|style|form|input|button|label|select|option|iframe|link|meta|title)\b" + return bool(re.search(pattern, content, re.IGNORECASE)) + + async def _emit_status(self, __event_emitter__, applied_fixes: List[str]): + """Emit status notification""" + if not self.valves.show_status or not applied_fixes: + return + + description = "✓ Markdown 已修复" + if applied_fixes: + # Translate fix names for status display + fix_map = { + "Fix Escape Chars": "转义字符", + "Normalize Thought Tags": "思维标签", + "Fix Code Blocks": "代码块", + "Normalize LaTeX": "LaTeX公式", + "Fix List Format": "列表格式", + "Close Code Blocks": "闭合代码块", + "Fix Full-width Symbols": "全角符号", + "Fix Mermaid Syntax": "Mermaid语法", + "Fix Headings": "标题格式", + "Fix Tables": "表格格式", + "Cleanup XML Tags": "XML清理", + "Custom Cleaner": "自定义清理", + } + translated_fixes = [fix_map.get(fix, fix) for fix in applied_fixes] + description += f": {', '.join(translated_fixes)}" + + try: + await __event_emitter__( + { + "type": "status", + "data": { + "description": 
description, + "done": True, + }, + } + ) + except Exception as e: + print(f"Error emitting status: {e}") + + async def _emit_debug_log( + self, + __event_emitter__, + applied_fixes: List[str], + original: str, + normalized: str, + ): + """Emit debug log to browser console via JS execution""" + + async def _emit_debug_log( + self, __event_call__, applied_fixes: List[str], original: str, normalized: str + ): + """Emit debug log to browser console via JS execution""" + if not self.valves.show_debug_log or not __event_call__: + return + + try: + # Prepare data for JS + log_data = { + "fixes": applied_fixes, + "original": original, + "normalized": normalized, + } + + # Construct JS code + js_code = f""" + (async function() {{ + console.group("🛠️ Markdown Normalizer Debug"); + console.log("Applied Fixes:", {json.dumps(applied_fixes, ensure_ascii=False)}); + console.log("Original Content:", {json.dumps(original, ensure_ascii=False)}); + console.log("Normalized Content:", {json.dumps(normalized, ensure_ascii=False)}); + console.groupEnd(); + }})(); + """ + await __event_call__( + { + "type": "execute", + "data": {"code": js_code}, + } + ) + + except Exception as e: + print(f"Error emitting debug log: {e}") + + async def outlet( + self, + body: dict, + __user__: Optional[dict] = None, + __event_emitter__=None, + __event_call__=None, + __metadata__: Optional[dict] = None, + ) -> dict: + """ + Process the response body to normalize Markdown content. 
+ """ + if "messages" in body and body["messages"]: + last = body["messages"][-1] + content = last.get("content", "") or "" + + if last.get("role") == "assistant" and isinstance(content, str): + # Skip if content looks like HTML to avoid breaking it + if self._contains_html(content): + return body + + # Configure normalizer based on valves + config = NormalizerConfig( + enable_escape_fix=self.valves.enable_escape_fix, + enable_thought_tag_fix=self.valves.enable_thought_tag_fix, + enable_code_block_fix=self.valves.enable_code_block_fix, + enable_latex_fix=self.valves.enable_latex_fix, + enable_list_fix=self.valves.enable_list_fix, + enable_unclosed_block_fix=self.valves.enable_unclosed_block_fix, + enable_fullwidth_symbol_fix=self.valves.enable_fullwidth_symbol_fix, + enable_mermaid_fix=self.valves.enable_mermaid_fix, + enable_heading_fix=self.valves.enable_heading_fix, + enable_table_fix=self.valves.enable_table_fix, + enable_xml_tag_cleanup=self.valves.enable_xml_tag_cleanup, + ) + + normalizer = ContentNormalizer(config) + + # Execute normalization + new_content = normalizer.normalize(content) + + # Update content if changed + if new_content != content: + last["content"] = new_content + + # Emit status if enabled + if __event_emitter__: + await self._emit_status( + __event_emitter__, normalizer.applied_fixes + ) + await self._emit_debug_log( + __event_call__, + normalizer.applied_fixes, + content, + new_content, + ) + + return body diff --git a/plugins/filters/markdown_normalizer/test_markdown_normalizer.py b/plugins/filters/markdown_normalizer/test_markdown_normalizer.py new file mode 100644 index 0000000..4e9a7f9 --- /dev/null +++ b/plugins/filters/markdown_normalizer/test_markdown_normalizer.py @@ -0,0 +1,191 @@ +import unittest +import sys +import os + +# Add the current directory to sys.path to import the module +current_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(current_dir) + +from markdown_normalizer import ContentNormalizer, 
NormalizerConfig


class TestMarkdownNormalizer(unittest.TestCase):
    """Unit tests for ContentNormalizer's individual fix rules.

    NOTE(review): several string literals in this file lost their
    angle-bracket tags and full-width characters in a bad paste; they have
    been reconstructed from the surrounding comments and must be verified
    against the normalizer's actual patterns.
    """

    def setUp(self):
        # Enable every rule so each test can exercise its fix in isolation.
        self.config = NormalizerConfig(
            enable_escape_fix=True,
            enable_thought_tag_fix=True,
            enable_code_block_fix=True,
            enable_latex_fix=True,
            enable_list_fix=True,
            enable_unclosed_block_fix=True,
            enable_fullwidth_symbol_fix=True,
            enable_mermaid_fix=True,
            enable_xml_tag_cleanup=True,
        )
        self.normalizer = ContentNormalizer(self.config)

    def test_escape_fix(self):
        # Literal backslash sequences are converted to real control chars.
        input_text = "Line 1\\nLine 2\\tTabbed"
        expected = "Line 1\nLine 2\tTabbed"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_thought_tag_fix(self):
        # Case 1: standard tag spacing — a blank line is forced after the tag.
        # TODO(review): tag literals reconstructed after paste damage — confirm.
        input_text = "<think>Thinking...</think>Result"
        expected = "<think>Thinking...</think>\n\nResult"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

        # Case 2: tag standardization (<thinking> -> <think>).
        input_text_deepseek = "<thinking>Deep thinking...</thinking>Result"
        expected_deepseek = "<think>Deep thinking...</think>\n\nResult"
        self.assertEqual(
            self.normalizer.normalize(input_text_deepseek), expected_deepseek
        )

    def test_code_block_fix(self):
        # Case 1: indentation before the fence is stripped.
        self.assertEqual(self.normalizer._fix_code_blocks(" ```python"), "```python")

        # Case 2: prefix (newline inserted before the block).
        self.assertEqual(
            self.normalizer._fix_code_blocks("Text```python"), "Text\n```python"
        )

        # Case 3: suffix (newline inserted after the language tag).
        self.assertEqual(
            self.normalizer._fix_code_blocks("```python print('hi')"),
            "```python\nprint('hi')",
        )

    def test_latex_fix(self):
        input_text = "Block: \\[ x^2 \\] Inline: \\( E=mc^2 \\)"
        expected = "Block: $$ x^2 $$ Inline: $ E=mc^2 $"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_list_fix(self):
        # An ordered-list marker glued to preceding text gets its own line.
        input_text_bad = "Header1. Item 1"
        expected = "Header\n1. Item 1"
        self.assertEqual(self.normalizer.normalize(input_text_bad), expected)

    def test_unclosed_code_block_fix(self):
        input_text = "```python\nprint('hello')"
        expected = "```python\nprint('hello')\n```"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_fullwidth_symbol_fix(self):
        # TODO(review): full-width characters reconstructed after paste
        # damage — inside the code block they should be converted to ASCII,
        # outside they must be left alone.
        input_text = "Outside：Fullwidth ```python\nprint（'hello'）```"

        normalized = self.normalizer.normalize(input_text)
        self.assertIn("print('hello')", normalized)
        self.assertIn("Outside：Fullwidth", normalized)
        self.assertNotIn("（", normalized)
        self.assertNotIn("）", normalized)

    def test_mermaid_fix(self):
        # Unquoted labels containing brackets/parens are wrapped in quotes.
        # Note: the regex-based fix handles mixed brackets well (e.g. [] inside
        # ()) but cannot perfectly handle same-type nesting without a parser.
        input_text = """
```mermaid
graph TD
    A[Label with (parens)] --> B(Label with [brackets])
    C{Label with [brackets]}
```
"""
        normalized = self.normalizer.normalize(input_text)

        self.assertIn('A["Label with (parens)"]', normalized)
        self.assertIn('B("Label with [brackets]")', normalized)
        self.assertIn('C{"Label with [brackets]"}', normalized)

    def test_mermaid_shapes_regression(self):
        # Regression: ((...)) must not be broken into ("(...)") — shape
        # delimiters stay intact, only the label text gets quoted.
        input_text = """
```mermaid
graph TD
    Start((开始)) --> Input[[输入]]
    Input --> Verify{验证}
    Verify --> End(((结束)))
```
"""
        normalized = self.normalizer.normalize(input_text)
        self.assertIn('Start(("开始"))', normalized)
        self.assertIn('Input[["输入"]]', normalized)
        self.assertIn('Verify{"验证"}', normalized)
        self.assertIn('End((("结束")))', normalized)

    def test_xml_cleanup(self):
        # Leftover artifact tags are stripped; their inner text is preserved.
        # TODO(review): tag literal reconstructed after paste damage — replace
        # with a tag actually matched by the xml_artifacts pattern.
        input_text = "Some text <answer>hidden</answer> visible"
        expected = "Some text hidden visible"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_heading_fix(self):
        input_text = "#Heading 1\n##Heading 2\n### Valid Heading"
        expected = "# Heading 1\n## Heading 2\n### Valid Heading"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_table_fix(self):
        input_text = "| Col 1 | Col 2\n| Val 1 | Val 2"
        expected = "| Col 1 | Col 2|\n| Val 1 | Val 2|"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_mermaid_subgraph_autoclose(self):
        """Test auto-closing of Mermaid subgraphs."""
        # Case 1: simple unclosed subgraph. Simple IDs like A and B are NOT
        # matched by the mermaid node regex (no shape delimiter), so they
        # stay unquoted.
        original = """
```mermaid
graph TD
    subgraph One
        A --> B
```
"""
        expected = """
```mermaid
graph TD
    subgraph One
        A --> B
 end
```
"""
        normalized = self.normalizer.normalize(original)
        self.assertIn("end", normalized)
        self.assertEqual(normalized.strip(), expected.strip())

        # Case 2: nested unclosed subgraphs — one "end" per open subgraph.
        original_nested = """
```mermaid
graph TD
    subgraph Outer
        subgraph Inner
            C --> D
```
"""
        normalized_nested = self.normalizer.normalize(original_nested)
        self.assertEqual(normalized_nested.count("end"), 2)


if __name__ == "__main__":
    unittest.main()