feat: enhance markdown normalizer with mermaid fix and frontend logging

This commit is contained in:
fujie
2026-01-10 15:45:20 +08:00
parent 5fa56ba88d
commit ef34cc326c
4 changed files with 1294 additions and 1 deletions

View File

@@ -260,7 +260,46 @@ async def _emit_notification(
## 📋 日志规范 (Logging Standard) ## 📋 日志规范 (Logging Standard)
- **禁止使用** `print()` 语句 ### 1. 前端控制台调试 (Frontend Console Debugging) - **优先推荐 (Preferred)**
对于需要实时查看数据流、排查 UI 交互或内容变更的场景,**优先使用**前端控制台日志。这种方式可以直接在浏览器 DevTools (F12) 中查看,无需访问服务端日志。
**实现方式**: 通过 `__event_emitter__` 发送 `type: "execute"` 事件执行 JS 代码。
```python
import json
async def _emit_debug_log(self, __event_emitter__, title: str, data: dict):
    """Print a structured debug log in the browser console.

    Sends an ``execute`` event whose JavaScript opens a console group named
    after ``title`` and logs ``data`` inside it. Nothing is sent when the
    ``show_debug_log`` valve is off or no emitter was supplied.

    Args:
        __event_emitter__: Awaitable callback that receives the event dict.
        title: Heading shown for the console group.
        data: JSON-serializable payload to log.
    """
    if not self.valves.show_debug_log or not __event_emitter__:
        return
    try:
        # Encode the title with json.dumps just like the payload: a raw
        # interpolation would let a quote or backslash in the title terminate
        # the JS string literal and break (or inject into) the script.
        group_label = json.dumps(f"🛠️ {title}", ensure_ascii=False)
        payload = json.dumps(data, ensure_ascii=False)
        js_code = f"""
            (async function() {{
                console.group({group_label});
                console.log({payload});
                console.groupEnd();
            }})();
        """
        await __event_emitter__({
            "type": "execute",
            "data": {"code": js_code}
        })
    except Exception as e:
        # Best effort: a failed debug log must never break the response.
        print(f"Error emitting debug log: {e}")
```
**配置要求**:
- 在 `Valves` 中添加 `show_debug_log: bool` 开关,默认关闭。
- 仅在开关开启时发送日志。
### 2. 服务端日志 (Server-side Logging)
用于记录系统级错误、异常堆栈或无需前端感知的后台任务。
- **禁止使用** `print()` 语句 (除非用于简单的脚本调试)
- 必须使用 Python 标准库 `logging` - 必须使用 Python 标准库 `logging`
```python ```python

View File

@@ -0,0 +1,519 @@
"""
title: Markdown Normalizer
author: Fu-Jie
author_url: https://github.com/Fu-Jie
funding_url: https://github.com/Fu-Jie/awesome-openwebui
version: 1.0.0
description: A production-grade content normalizer filter that fixes common Markdown formatting issues in LLM outputs, such as broken code blocks, LaTeX formulas, and list formatting.
"""
from pydantic import BaseModel, Field
from typing import Optional, List, Callable
import re
import logging
import logging
import asyncio
import json
from dataclasses import dataclass, field
# Configure logging
logger = logging.getLogger(__name__)
@dataclass
class NormalizerConfig:
    """Switches that enable or disable individual normalization rules."""

    # Fix excessive escape characters.
    enable_escape_fix: bool = True
    # Normalize thought tags.
    enable_thought_tag_fix: bool = True
    # Fix code block formatting.
    enable_code_block_fix: bool = True
    # Fix LaTeX formula formatting.
    enable_latex_fix: bool = True
    # Fix list item newlines; off by default because it can be aggressive.
    enable_list_fix: bool = False
    # Auto-close unclosed code blocks.
    enable_unclosed_block_fix: bool = True
    # Fix full-width symbols in code blocks.
    enable_fullwidth_symbol_fix: bool = False
    # Fix common Mermaid syntax errors.
    enable_mermaid_fix: bool = True
    # Fix missing space in headings (#Header -> # Header).
    enable_heading_fix: bool = True
    # Fix missing closing pipe in tables.
    enable_table_fix: bool = True
    # Clean up leftover XML tags.
    enable_xml_tag_cleanup: bool = True

    # Extra user-supplied cleaner callables (str -> str) for advanced extension.
    custom_cleaners: List[Callable[[str], str]] = field(default_factory=list)
class ContentNormalizer:
    """Repair common Markdown defects in LLM output with an ordered rule pipeline.

    Every rule is switched by a flag on the config object. After ``normalize``
    returns, ``applied_fixes`` lists the labels of the rules that changed the
    text (empty when nothing changed or when normalization failed).
    """

    # Pre-compiled once at class creation and shared by all instances.
    _PATTERNS = {
        # Code block prefix: if ``` is not at start of line or file
        "code_block_prefix": re.compile(r"(?<!^)(?<!\n)(```)", re.MULTILINE),
        # Code block suffix: ```lang followed by non-whitespace (no newline)
        "code_block_suffix": re.compile(r"(```[\w\+\-\.]*)[ \t]+([^\n\r])"),
        # Code block indent: whitespace at start of line + ```
        "code_block_indent": re.compile(r"^[ \t]+(```)", re.MULTILINE),
        # Thought tag: </thought> followed by optional whitespace/newlines
        "thought_end": re.compile(
            r"</(thought|think|thinking)>[ \t]*\n*", re.IGNORECASE
        ),
        "thought_start": re.compile(r"<(thought|think|thinking)>", re.IGNORECASE),
        # LaTeX block: \[ ... \]
        "latex_bracket_block": re.compile(r"\\\[(.+?)\\\]", re.DOTALL),
        # LaTeX inline: \( ... \)
        "latex_paren_inline": re.compile(r"\\\((.+?)\\\)"),
        # List item glued to preceding text: non-newline, non-digit char +
        # number + dot + space. Excluding digits keeps multi-digit numbers
        # such as "10. " from being split after their first digit.
        "list_item": re.compile(r"([^\n\d])(\d+\. )"),
        # XML artifacts (e.g. Claude's)
        "xml_artifacts": re.compile(
            r"</?(?:antArtifact|antThinking|artifact)[^>]*>", re.IGNORECASE
        ),
        # Mermaid: match the various node shapes and quote unquoted labels.
        # Shape delimiters must be matched precisely to avoid breaking the
        # structure; longer delimiters are listed first so they win.
        "mermaid_node": re.compile(
            r"(\w+)\s*(?:"
            r"(\(\(\()(?![\"])(.*?)(?<![\"])(\)\)\))|"  # (((...))) Double Circle
            r"(\(\()(?![\"])(.*?)(?<![\"])(\)\))|"  # ((...)) Circle
            r"(\(\[)(?![\"])(.*?)(?<![\"])(\]\))|"  # ([...]) Stadium
            r"(\[\()(?![\"])(.*?)(?<![\"])(\)\])|"  # [(...)] Cylinder
            r"(\[\[)(?![\"])(.*?)(?<![\"])(\]\])|"  # [[...]] Subroutine
            r"(\{\{)(?![\"])(.*?)(?<![\"])(\}\})|"  # {{...}} Hexagon
            r"(\[/)(?![\"])(.*?)(?<![\"])(/\])|"  # [/.../] Parallelogram
            r"(\[\\)(?![\"])(.*?)(?<![\"])(\\\])|"  # [\...\] Parallelogram Alt
            r"(\[/)(?![\"])(.*?)(?<![\"])(\\\])|"  # [/...\] Trapezoid
            r"(\[\\)(?![\"])(.*?)(?<![\"])(/\])|"  # [\.../] Trapezoid Alt
            r"(\()(?![\"])(.*?)(?<![\"])(\))|"  # (...) Round
            r"(\[)(?![\"])(.*?)(?<![\"])(\])|"  # [...] Square
            r"(\{)(?![\"])(.*?)(?<![\"])(\})|"  # {...} Rhombus
            r"(>)(?![\"])(.*?)(?<![\"])(\])"  # >...] Asymmetric
            r")"
        ),
        # Heading: #Heading -> # Heading
        "heading_space": re.compile(r"^(#+)([^ \n#])", re.MULTILINE),
        # Table row starting with a pipe whose last non-blank character is not
        # a pipe; group 2 keeps trailing spaces/tabs. Rows already ending in
        # "| " are left alone instead of receiving a second pipe.
        "table_pipe": re.compile(r"^(\|.*[^|\s])([ \t]*)$", re.MULTILINE),
    }

    # One-level unescape, done in a single left-to-right pass so that an
    # escaped backslash followed by "n" is not unescaped twice.
    _ESCAPE_SEQUENCE = re.compile(r"\\(?:r\\n|[nt\\])")
    _ESCAPE_REPLACEMENTS = {
        "\\r\\n": "\n",
        "\\n": "\n",
        "\\t": "\t",
        "\\\\": "\\",
    }

    # CJK / full-width punctuation -> ASCII. Keys are written as escapes so the
    # table survives tools that strip non-ASCII punctuation.
    # NOTE(review): the keys were lost in extraction and are reconstructed
    # from the ASCII targets -- confirm against the upstream file.
    _FULLWIDTH_TABLE = str.maketrans(
        {
            "\uff0c": ",",  # full-width comma
            "\u3002": ".",  # ideographic full stop
            "\uff08": "(",  # full-width left parenthesis
            "\uff09": ")",  # full-width right parenthesis
            "\u3010": "[",  # left black lenticular bracket
            "\u3011": "]",  # right black lenticular bracket
            "\uff1b": ";",  # full-width semicolon
            "\uff1a": ":",  # full-width colon
            "\uff1f": "?",  # full-width question mark
            "\uff01": "!",  # full-width exclamation mark
            "\u201c": '"',  # left double quotation mark
            "\u201d": '"',  # right double quotation mark
            "\u2018": "'",  # left single quotation mark
            "\u2019": "'",  # right single quotation mark
        }
    )

    # Pipeline order matters (escape fix must run first).
    # Each entry: (config flag, method name, label recorded in applied_fixes).
    _RULES = (
        ("enable_escape_fix", "_fix_escape_characters", "Fix Escape Chars"),
        ("enable_thought_tag_fix", "_fix_thought_tags", "Normalize Thought Tags"),
        ("enable_code_block_fix", "_fix_code_blocks", "Fix Code Blocks"),
        ("enable_latex_fix", "_fix_latex_formulas", "Normalize LaTeX"),
        ("enable_list_fix", "_fix_list_formatting", "Fix List Format"),
        ("enable_unclosed_block_fix", "_fix_unclosed_code_blocks", "Close Code Blocks"),
        (
            "enable_fullwidth_symbol_fix",
            "_fix_fullwidth_symbols_in_code",
            "Fix Full-width Symbols",
        ),
        ("enable_mermaid_fix", "_fix_mermaid_syntax", "Fix Mermaid Syntax"),
        ("enable_heading_fix", "_fix_headings", "Fix Headings"),
        ("enable_table_fix", "_fix_tables", "Fix Tables"),
        ("enable_xml_tag_cleanup", "_cleanup_xml_tags", "Cleanup XML Tags"),
    )

    def __init__(self, config: Optional["NormalizerConfig"] = None):
        """Store the rule configuration, falling back to the defaults."""
        self.config = config or NormalizerConfig()
        self.applied_fixes: List[str] = []

    def normalize(self, content: str) -> str:
        """Run every enabled rule, then the custom cleaners, over ``content``.

        Args:
            content: Raw Markdown text; falsy input is returned unchanged.

        Returns:
            The normalized text. If any rule or cleaner raises, the error is
            logged and the ORIGINAL text is returned with ``applied_fixes``
            emptied, so callers never see a half-processed result.
        """
        self.applied_fixes = []
        if not content:
            return content
        original_content = content  # returned on failure and used for logging
        try:
            for flag, method_name, label in self._RULES:
                if not getattr(self.config, flag):
                    continue
                updated = getattr(self, method_name)(content)
                if updated != content:
                    self.applied_fixes.append(label)
                content = updated
            # Custom cleaners always run last, in the order they were given.
            for cleaner in self.config.custom_cleaners:
                updated = cleaner(content)
                if updated != content:
                    self.applied_fixes.append("Custom Cleaner")
                content = updated
            if self.applied_fixes:
                logger.info("Markdown Normalizer Applied Fixes: %s", self.applied_fixes)
                logger.debug(
                    "--- Original Content ---\n%s\n------------------------",
                    original_content,
                )
                logger.debug(
                    "--- Normalized Content ---\n%s\n--------------------------",
                    content,
                )
            return content
        except Exception as e:
            # Production safeguard: return original content on error
            logger.error("Content normalization failed: %s", e, exc_info=True)
            self.applied_fixes = []
            return original_content

    @staticmethod
    def _transform_segments(content: str, first: int, transform) -> str:
        """Split on ``` fences and apply ``transform`` to every second chunk.

        ``first=0`` targets the text outside fences, ``first=1`` the chunks
        between fences (language line included).
        """
        parts = content.split("```")
        for idx in range(first, len(parts), 2):
            parts[idx] = transform(parts[idx])
        return "```".join(parts)

    def _fix_escape_characters(self, content: str) -> str:
        """Undo one level of over-escaping for newline, tab and backslash."""
        return self._ESCAPE_SEQUENCE.sub(
            lambda m: self._ESCAPE_REPLACEMENTS[m.group(0)], content
        )

    def _fix_thought_tags(self, content: str) -> str:
        """Normalize thought tags: unify naming and fix spacing."""
        # <think>, <thinking> -> <thought>
        content = self._PATTERNS["thought_start"].sub("<thought>", content)
        # Closing tag is renamed and always followed by exactly one blank line.
        return self._PATTERNS["thought_end"].sub("</thought>\n\n", content)

    def _fix_code_blocks(self, content: str) -> str:
        """Fix code fence formatting (indentation, missing newlines)."""
        # Remove indentation before code blocks
        content = self._PATTERNS["code_block_indent"].sub(r"\1", content)
        # Ensure newline before ```
        content = self._PATTERNS["code_block_prefix"].sub(r"\n\1", content)
        # Ensure newline after ```lang
        content = self._PATTERNS["code_block_suffix"].sub(r"\1\n\2", content)
        return content

    def _fix_latex_formulas(self, content: str) -> str:
        r"""Normalize LaTeX delimiters: \[ ... \] -> $$ (block), \( ... \) -> $ (inline)."""
        content = self._PATTERNS["latex_bracket_block"].sub(r"$$\1$$", content)
        content = self._PATTERNS["latex_paren_inline"].sub(r"$\1$", content)
        return content

    def _fix_list_formatting(self, content: str) -> str:
        """Put a numbered list item glued to preceding text on its own line."""
        return self._PATTERNS["list_item"].sub(r"\1\n\2", content)

    def _fix_unclosed_code_blocks(self, content: str) -> str:
        """Append a closing fence when the number of ``` markers is odd."""
        if content.count("```") % 2 != 0:
            content += "\n```"
        return content

    def _fix_fullwidth_symbols_in_code(self, content: str) -> str:
        """Convert full-width punctuation to ASCII inside code blocks only."""
        table = self._FULLWIDTH_TABLE
        return self._transform_segments(content, 1, lambda code: code.translate(table))

    def _fix_mermaid_syntax(self, content: str) -> str:
        """Quote unquoted Mermaid node labels and close unterminated subgraphs.

        Node shapes are preserved. Labels that are already quoted, including
        text that sits inside an existing quoted label, are left untouched so
        the fix is safe to run on already-correct diagrams.
        """

        def quote_label(match):
            text = match.string
            # An odd number of quotes before the match on this line means the
            # match lies inside an existing quoted label: leave it alone.
            line_start = text.rfind("\n", 0, match.start()) + 1
            if text.count('"', line_start, match.start()) % 2:
                return match.group(0)
            groups = match.groups()
            node_id = groups[0]
            # After the id, each shape owns three groups: open, label, close.
            for idx in range(1, len(groups), 3):
                opener = groups[idx]
                if opener is None:
                    continue
                label, closer = groups[idx + 1], groups[idx + 2]
                # A quoted compound shape such as B(["x"]) falls through to a
                # simpler shape with a label like ["x"]; it is already valid.
                if label.lstrip("([{/\\").startswith('"'):
                    return match.group(0)
                # Mermaid has no backslash escapes; quotes inside a quoted
                # label must be written as the #quot; entity.
                label = label.replace('"', "#quot;")
                return f'{node_id}{opener}"{label}"{closer}'
            return match.group(0)

        def fix_block(block_text):
            lang_line = block_text.split("\n", 1)[0].strip().lower()
            if "mermaid" not in lang_line:
                return block_text
            block_text = self._PATTERNS["mermaid_node"].sub(quote_label, block_text)
            # Heuristic: compare keyword counts over the whole block and
            # append one "end" per unmatched "subgraph".
            subgraph_count = len(re.findall(r"\bsubgraph\b", block_text, re.IGNORECASE))
            end_count = len(re.findall(r"\bend\b", block_text, re.IGNORECASE))
            if subgraph_count > end_count:
                missing_ends = subgraph_count - end_count
                block_text = block_text.rstrip() + ("\n end" * missing_ends) + "\n"
            return block_text

        return self._transform_segments(content, 1, fix_block)

    def _fix_headings(self, content: str) -> str:
        """Insert the missing space after heading markers (#Heading -> # Heading).

        Only text outside code fences is touched, so comments such as
        ``#TODO`` inside code are not rewritten.
        """
        pattern = self._PATTERNS["heading_space"]
        return self._transform_segments(
            content, 0, lambda text: pattern.sub(r"\1 \2", text)
        )

    def _fix_tables(self, content: str) -> str:
        """Add the missing closing pipe to table rows outside code fences."""
        pattern = self._PATTERNS["table_pipe"]
        return self._transform_segments(
            content, 0, lambda text: pattern.sub(r"\1\2|", text)
        )

    def _cleanup_xml_tags(self, content: str) -> str:
        """Remove leftover XML artifact tags."""
        return self._PATTERNS["xml_artifacts"].sub("", content)
class Filter:
    """Open WebUI outlet filter that normalizes Markdown in the last assistant message."""

    class Valves(BaseModel):
        """User-adjustable settings; the enable_* switches mirror NormalizerConfig."""

        priority: int = Field(
            default=50,
            description="Priority level. Higher runs later (recommended to run after other filters).",
        )
        enable_escape_fix: bool = Field(
            default=True, description="Fix excessive escape characters (\\n, \\t, etc.)"
        )
        enable_thought_tag_fix: bool = Field(
            default=True, description="Normalize </thought> tags"
        )
        enable_code_block_fix: bool = Field(
            default=True,
            description="Fix code block formatting (indentation, newlines)",
        )
        enable_latex_fix: bool = Field(
            default=True, description="Normalize LaTeX formulas (\\[ -> $$, \\( -> $)"
        )
        enable_list_fix: bool = Field(
            default=False, description="Fix list item newlines (Experimental)"
        )
        enable_unclosed_block_fix: bool = Field(
            default=True, description="Auto-close unclosed code blocks"
        )
        enable_fullwidth_symbol_fix: bool = Field(
            default=False, description="Fix full-width symbols in code blocks"
        )
        enable_mermaid_fix: bool = Field(
            default=True,
            description="Fix common Mermaid syntax errors (e.g. unquoted labels)",
        )
        enable_heading_fix: bool = Field(
            default=True,
            description="Fix missing space in headings (#Header -> # Header)",
        )
        enable_table_fix: bool = Field(
            default=True, description="Fix missing closing pipe in tables"
        )
        enable_xml_tag_cleanup: bool = Field(
            default=True, description="Cleanup leftover XML tags"
        )
        show_status: bool = Field(
            default=True, description="Show status notification when fixes are applied"
        )
        show_debug_log: bool = Field(
            default=False, description="Print debug logs to browser console (F12)"
        )

    def __init__(self):
        """Create the filter with default valve values."""
        self.valves = self.Valves()

    def _contains_html(self, content: str) -> bool:
        """Return True if ``content`` contains an opening or closing tag from a fixed HTML tag list.

        Used to skip normalization so HTML output is not broken. This is a
        heuristic: text such as ``a <b`` also counts as a tag.
        """
        pattern = r"<\s*/?\s*(?:html|head|body|div|span|p|br|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th|img|a|b|i|strong|em|code|pre|blockquote|h[1-6]|script|style|form|input|button|label|select|option|iframe|link|meta|title)\b"
        return bool(re.search(pattern, content, re.IGNORECASE))

    async def _emit_status(self, __event_emitter__, applied_fixes: List[str]):
        """Send a "done" status event that lists the applied fixes.

        Does nothing when the ``show_status`` valve is off, when no fix was
        applied, or when no emitter is available.
        """
        if not self.valves.show_status or not applied_fixes or not __event_emitter__:
            return
        description = f"✓ Markdown Normalized: {', '.join(applied_fixes)}"
        try:
            await __event_emitter__(
                {
                    "type": "status",
                    "data": {
                        "description": description,
                        "done": True,
                    },
                }
            )
        except Exception as e:
            # Best effort: a failed notification must not break the response.
            logger.warning("Error emitting status: %s", e)

    async def _emit_debug_log(
        self, __event_call__, applied_fixes: List[str], original: str, normalized: str
    ):
        """Log the applied fixes and both text versions to the browser console.

        Sends an ``execute`` event through ``__event_call__``. Does nothing
        when the ``show_debug_log`` valve is off or no callback is available.
        """
        if not self.valves.show_debug_log or not __event_call__:
            return
        try:
            # json.dumps yields valid JavaScript literals for all three values.
            js_code = f"""
                (async function() {{
                    console.group("🛠️ Markdown Normalizer Debug");
                    console.log("Applied Fixes:", {json.dumps(applied_fixes, ensure_ascii=False)});
                    console.log("Original Content:", {json.dumps(original, ensure_ascii=False)});
                    console.log("Normalized Content:", {json.dumps(normalized, ensure_ascii=False)});
                    console.groupEnd();
                }})();
            """
            await __event_call__(
                {
                    "type": "execute",
                    "data": {"code": js_code},
                }
            )
        except Exception as e:
            # Best effort: debugging output must not break the response.
            logger.warning("Error emitting debug log: %s", e)

    async def outlet(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __event_emitter__=None,
        __event_call__=None,
        __metadata__: Optional[dict] = None,
    ) -> dict:
        """Normalize the Markdown of the last message when it is an assistant reply.

        Args:
            body: Response body; ``body["messages"][-1]["content"]`` is
                rewritten in place when normalization changes it.
            __event_emitter__: Optional callback used for the status event.
            __event_call__: Optional callback used for the console debug log.

        Returns:
            The same ``body`` object.
        """
        if not ("messages" in body and body["messages"]):
            return body
        last = body["messages"][-1]
        content = last.get("content", "") or ""
        # Only plain-string assistant messages are normalized.
        if last.get("role") != "assistant" or not isinstance(content, str):
            return body
        # Skip if content looks like HTML to avoid breaking it
        if self._contains_html(content):
            return body

        # Configure normalizer based on valves
        config = NormalizerConfig(
            enable_escape_fix=self.valves.enable_escape_fix,
            enable_thought_tag_fix=self.valves.enable_thought_tag_fix,
            enable_code_block_fix=self.valves.enable_code_block_fix,
            enable_latex_fix=self.valves.enable_latex_fix,
            enable_list_fix=self.valves.enable_list_fix,
            enable_unclosed_block_fix=self.valves.enable_unclosed_block_fix,
            enable_fullwidth_symbol_fix=self.valves.enable_fullwidth_symbol_fix,
            enable_mermaid_fix=self.valves.enable_mermaid_fix,
            enable_heading_fix=self.valves.enable_heading_fix,
            enable_table_fix=self.valves.enable_table_fix,
            enable_xml_tag_cleanup=self.valves.enable_xml_tag_cleanup,
        )
        normalizer = ContentNormalizer(config)
        new_content = normalizer.normalize(content)
        if new_content == content:
            return body

        last["content"] = new_content
        # Each helper checks its own valve and callback, so a missing status
        # emitter does not suppress the debug log (and vice versa).
        await self._emit_status(__event_emitter__, normalizer.applied_fixes)
        await self._emit_debug_log(
            __event_call__,
            normalizer.applied_fixes,
            content,
            new_content,
        )
        return body

View File

@@ -0,0 +1,544 @@
"""
title: Markdown 格式修复器 (Markdown Normalizer)
author: Fu-Jie
author_url: https://github.com/Fu-Jie
funding_url: https://github.com/Fu-Jie/awesome-openwebui
version: 1.0.0
description: 生产级内容规范化过滤器,修复 LLM 输出中常见的 Markdown 格式问题如损坏的代码块、LaTeX 公式、Mermaid 图表和列表格式。
"""
from pydantic import BaseModel, Field
from typing import Optional, List, Callable
import re
import logging
import asyncio
import json
from dataclasses import dataclass, field
# Configure logging
logger = logging.getLogger(__name__)
@dataclass
class NormalizerConfig:
    """Switches that enable or disable individual normalization rules."""

    # Fix excessive escape characters.
    enable_escape_fix: bool = True
    # Normalize chain-of-thought tags.
    enable_thought_tag_fix: bool = True
    # Fix code block formatting.
    enable_code_block_fix: bool = True
    # Fix LaTeX formula formatting.
    enable_latex_fix: bool = True
    # Fix list item newlines; off by default because it can be too aggressive.
    enable_list_fix: bool = False
    # Auto-close unclosed code blocks.
    enable_unclosed_block_fix: bool = True
    # Fix full-width symbols in code blocks.
    enable_fullwidth_symbol_fix: bool = False
    # Fix common Mermaid syntax errors.
    enable_mermaid_fix: bool = True
    # Fix missing space in headings (#Header -> # Header).
    enable_heading_fix: bool = True
    # Fix missing closing pipe in tables.
    enable_table_fix: bool = True
    # Clean up leftover XML tags.
    enable_xml_tag_cleanup: bool = True

    # Extra user-supplied cleaner callables (str -> str) for advanced extension.
    custom_cleaners: List[Callable[[str], str]] = field(default_factory=list)
class ContentNormalizer:
    """Repair common Markdown defects in LLM output with an ordered rule pipeline.

    Every rule is switched by a flag on the config object. After ``normalize``
    returns, ``applied_fixes`` lists the labels of the rules that changed the
    text (empty when nothing changed or when normalization failed).
    """

    # Pre-compiled once at class creation and shared by all instances.
    _PATTERNS = {
        # Code block prefix: if ``` is not at start of line or file
        "code_block_prefix": re.compile(r"(?<!^)(?<!\n)(```)", re.MULTILINE),
        # Code block suffix: ```lang followed by non-whitespace (no newline)
        "code_block_suffix": re.compile(r"(```[\w\+\-\.]*)[ \t]+([^\n\r])"),
        # Code block indent: whitespace at start of line + ```
        "code_block_indent": re.compile(r"^[ \t]+(```)", re.MULTILINE),
        # Thought tag: </thought> followed by optional whitespace/newlines
        "thought_end": re.compile(
            r"</(thought|think|thinking)>[ \t]*\n*", re.IGNORECASE
        ),
        "thought_start": re.compile(r"<(thought|think|thinking)>", re.IGNORECASE),
        # LaTeX block: \[ ... \]
        "latex_bracket_block": re.compile(r"\\\[(.+?)\\\]", re.DOTALL),
        # LaTeX inline: \( ... \)
        "latex_paren_inline": re.compile(r"\\\((.+?)\\\)"),
        # List item glued to preceding text: non-newline, non-digit char +
        # number + dot + space. Excluding digits keeps multi-digit numbers
        # such as "10. " from being split after their first digit.
        "list_item": re.compile(r"([^\n\d])(\d+\. )"),
        # XML artifacts (e.g. Claude's)
        "xml_artifacts": re.compile(
            r"</?(?:antArtifact|antThinking|artifact)[^>]*>", re.IGNORECASE
        ),
        # Mermaid: match the various node shapes and quote unquoted labels.
        # Shape delimiters must be matched precisely to avoid breaking the
        # structure; longer delimiters are listed first so they win.
        "mermaid_node": re.compile(
            r"(\w+)\s*(?:"
            r"(\(\(\()(?![\"])(.*?)(?<![\"])(\)\)\))|"  # (((...))) Double Circle
            r"(\(\()(?![\"])(.*?)(?<![\"])(\)\))|"  # ((...)) Circle
            r"(\(\[)(?![\"])(.*?)(?<![\"])(\]\))|"  # ([...]) Stadium
            r"(\[\()(?![\"])(.*?)(?<![\"])(\)\])|"  # [(...)] Cylinder
            r"(\[\[)(?![\"])(.*?)(?<![\"])(\]\])|"  # [[...]] Subroutine
            r"(\{\{)(?![\"])(.*?)(?<![\"])(\}\})|"  # {{...}} Hexagon
            r"(\[/)(?![\"])(.*?)(?<![\"])(/\])|"  # [/.../] Parallelogram
            r"(\[\\)(?![\"])(.*?)(?<![\"])(\\\])|"  # [\...\] Parallelogram Alt
            r"(\[/)(?![\"])(.*?)(?<![\"])(\\\])|"  # [/...\] Trapezoid
            r"(\[\\)(?![\"])(.*?)(?<![\"])(/\])|"  # [\.../] Trapezoid Alt
            r"(\()(?![\"])(.*?)(?<![\"])(\))|"  # (...) Round
            r"(\[)(?![\"])(.*?)(?<![\"])(\])|"  # [...] Square
            r"(\{)(?![\"])(.*?)(?<![\"])(\})|"  # {...} Rhombus
            r"(>)(?![\"])(.*?)(?<![\"])(\])"  # >...] Asymmetric
            r")"
        ),
        # Heading: #Heading -> # Heading
        "heading_space": re.compile(r"^(#+)([^ \n#])", re.MULTILINE),
        # Table row starting with a pipe whose last non-blank character is not
        # a pipe; group 2 keeps trailing spaces/tabs. Rows already ending in
        # "| " are left alone instead of receiving a second pipe.
        "table_pipe": re.compile(r"^(\|.*[^|\s])([ \t]*)$", re.MULTILINE),
    }

    # One-level unescape, done in a single left-to-right pass so that an
    # escaped backslash followed by "n" is not unescaped twice.
    _ESCAPE_SEQUENCE = re.compile(r"\\(?:r\\n|[nt\\])")
    _ESCAPE_REPLACEMENTS = {
        "\\r\\n": "\n",
        "\\n": "\n",
        "\\t": "\t",
        "\\\\": "\\",
    }

    # CJK / full-width punctuation -> ASCII. Keys are written as escapes so the
    # table survives tools that strip non-ASCII punctuation.
    # NOTE(review): the keys were lost in extraction and are reconstructed
    # from the ASCII targets -- confirm against the upstream file.
    _FULLWIDTH_TABLE = str.maketrans(
        {
            "\uff0c": ",",  # full-width comma
            "\u3002": ".",  # ideographic full stop
            "\uff08": "(",  # full-width left parenthesis
            "\uff09": ")",  # full-width right parenthesis
            "\u3010": "[",  # left black lenticular bracket
            "\u3011": "]",  # right black lenticular bracket
            "\uff1b": ";",  # full-width semicolon
            "\uff1a": ":",  # full-width colon
            "\uff1f": "?",  # full-width question mark
            "\uff01": "!",  # full-width exclamation mark
            "\u201c": '"',  # left double quotation mark
            "\u201d": '"',  # right double quotation mark
            "\u2018": "'",  # left single quotation mark
            "\u2019": "'",  # right single quotation mark
        }
    )

    # Pipeline order matters (escape fix must run first).
    # Each entry: (config flag, method name, label recorded in applied_fixes).
    _RULES = (
        ("enable_escape_fix", "_fix_escape_characters", "Fix Escape Chars"),
        ("enable_thought_tag_fix", "_fix_thought_tags", "Normalize Thought Tags"),
        ("enable_code_block_fix", "_fix_code_blocks", "Fix Code Blocks"),
        ("enable_latex_fix", "_fix_latex_formulas", "Normalize LaTeX"),
        ("enable_list_fix", "_fix_list_formatting", "Fix List Format"),
        ("enable_unclosed_block_fix", "_fix_unclosed_code_blocks", "Close Code Blocks"),
        (
            "enable_fullwidth_symbol_fix",
            "_fix_fullwidth_symbols_in_code",
            "Fix Full-width Symbols",
        ),
        ("enable_mermaid_fix", "_fix_mermaid_syntax", "Fix Mermaid Syntax"),
        ("enable_heading_fix", "_fix_headings", "Fix Headings"),
        ("enable_table_fix", "_fix_tables", "Fix Tables"),
        ("enable_xml_tag_cleanup", "_cleanup_xml_tags", "Cleanup XML Tags"),
    )

    def __init__(self, config: Optional["NormalizerConfig"] = None):
        """Store the rule configuration, falling back to the defaults."""
        self.config = config or NormalizerConfig()
        self.applied_fixes: List[str] = []

    def normalize(self, content: str) -> str:
        """Run every enabled rule, then the custom cleaners, over ``content``.

        Args:
            content: Raw Markdown text; falsy input is returned unchanged.

        Returns:
            The normalized text. If any rule or cleaner raises, the error is
            logged and the ORIGINAL text is returned with ``applied_fixes``
            emptied, so callers never see a half-processed result.
        """
        self.applied_fixes = []
        if not content:
            return content
        original_content = content  # returned on failure and used for logging
        try:
            for flag, method_name, label in self._RULES:
                if not getattr(self.config, flag):
                    continue
                updated = getattr(self, method_name)(content)
                if updated != content:
                    self.applied_fixes.append(label)
                content = updated
            # Custom cleaners always run last, in the order they were given.
            for cleaner in self.config.custom_cleaners:
                updated = cleaner(content)
                if updated != content:
                    self.applied_fixes.append("Custom Cleaner")
                content = updated
            if self.applied_fixes:
                # Server-side diagnostics go through the module logger, not
                # print(); full text dumps are kept at DEBUG level.
                logger.info("Markdown Normalizer Applied Fixes: %s", self.applied_fixes)
                logger.debug(
                    "--- Original Content ---\n%s\n------------------------",
                    original_content,
                )
                logger.debug(
                    "--- Normalized Content ---\n%s\n--------------------------",
                    content,
                )
            return content
        except Exception as e:
            # Production safeguard: return original content on error
            logger.error("Content normalization failed: %s", e, exc_info=True)
            self.applied_fixes = []
            return original_content

    @staticmethod
    def _transform_segments(content: str, first: int, transform) -> str:
        """Split on ``` fences and apply ``transform`` to every second chunk.

        ``first=0`` targets the text outside fences, ``first=1`` the chunks
        between fences (language line included).
        """
        parts = content.split("```")
        for idx in range(first, len(parts), 2):
            parts[idx] = transform(parts[idx])
        return "```".join(parts)

    def _fix_escape_characters(self, content: str) -> str:
        """Undo one level of over-escaping for newline, tab and backslash."""
        return self._ESCAPE_SEQUENCE.sub(
            lambda m: self._ESCAPE_REPLACEMENTS[m.group(0)], content
        )

    def _fix_thought_tags(self, content: str) -> str:
        """Normalize thought tags: unify naming and fix spacing."""
        # <think>, <thinking> -> <thought>
        content = self._PATTERNS["thought_start"].sub("<thought>", content)
        # Closing tag is renamed and always followed by exactly one blank line.
        return self._PATTERNS["thought_end"].sub("</thought>\n\n", content)

    def _fix_code_blocks(self, content: str) -> str:
        """Fix code fence formatting (indentation, missing newlines)."""
        # Remove indentation before code blocks
        content = self._PATTERNS["code_block_indent"].sub(r"\1", content)
        # Ensure newline before ```
        content = self._PATTERNS["code_block_prefix"].sub(r"\n\1", content)
        # Ensure newline after ```lang
        content = self._PATTERNS["code_block_suffix"].sub(r"\1\n\2", content)
        return content

    def _fix_latex_formulas(self, content: str) -> str:
        r"""Normalize LaTeX delimiters: \[ ... \] -> $$ (block), \( ... \) -> $ (inline)."""
        content = self._PATTERNS["latex_bracket_block"].sub(r"$$\1$$", content)
        content = self._PATTERNS["latex_paren_inline"].sub(r"$\1$", content)
        return content

    def _fix_list_formatting(self, content: str) -> str:
        """Put a numbered list item glued to preceding text on its own line."""
        return self._PATTERNS["list_item"].sub(r"\1\n\2", content)

    def _fix_unclosed_code_blocks(self, content: str) -> str:
        """Append a closing fence when the number of ``` markers is odd."""
        if content.count("```") % 2 != 0:
            content += "\n```"
        return content

    def _fix_fullwidth_symbols_in_code(self, content: str) -> str:
        """Convert full-width punctuation to ASCII inside code blocks only."""
        table = self._FULLWIDTH_TABLE
        return self._transform_segments(content, 1, lambda code: code.translate(table))

    def _fix_mermaid_syntax(self, content: str) -> str:
        """Quote unquoted Mermaid node labels and close unterminated subgraphs.

        Node shapes are preserved. Labels that are already quoted, including
        text that sits inside an existing quoted label, are left untouched so
        the fix is safe to run on already-correct diagrams.
        """

        def quote_label(match):
            text = match.string
            # An odd number of quotes before the match on this line means the
            # match lies inside an existing quoted label: leave it alone.
            line_start = text.rfind("\n", 0, match.start()) + 1
            if text.count('"', line_start, match.start()) % 2:
                return match.group(0)
            groups = match.groups()
            node_id = groups[0]
            # After the id, each shape owns three groups: open, label, close.
            for idx in range(1, len(groups), 3):
                opener = groups[idx]
                if opener is None:
                    continue
                label, closer = groups[idx + 1], groups[idx + 2]
                # A quoted compound shape such as B(["x"]) falls through to a
                # simpler shape with a label like ["x"]; it is already valid.
                if label.lstrip("([{/\\").startswith('"'):
                    return match.group(0)
                # Mermaid has no backslash escapes; quotes inside a quoted
                # label must be written as the #quot; entity.
                label = label.replace('"', "#quot;")
                return f'{node_id}{opener}"{label}"{closer}'
            return match.group(0)

        def fix_block(block_text):
            lang_line = block_text.split("\n", 1)[0].strip().lower()
            if "mermaid" not in lang_line:
                return block_text
            block_text = self._PATTERNS["mermaid_node"].sub(quote_label, block_text)
            # Heuristic: compare keyword counts over the whole block and
            # append one "end" per unmatched "subgraph".
            subgraph_count = len(re.findall(r"\bsubgraph\b", block_text, re.IGNORECASE))
            end_count = len(re.findall(r"\bend\b", block_text, re.IGNORECASE))
            if subgraph_count > end_count:
                missing_ends = subgraph_count - end_count
                block_text = block_text.rstrip() + ("\n end" * missing_ends) + "\n"
            return block_text

        return self._transform_segments(content, 1, fix_block)

    def _fix_headings(self, content: str) -> str:
        """Insert the missing space after heading markers (#Heading -> # Heading).

        Only text outside code fences is touched, so comments such as
        ``#TODO`` inside code are not rewritten.
        """
        pattern = self._PATTERNS["heading_space"]
        return self._transform_segments(
            content, 0, lambda text: pattern.sub(r"\1 \2", text)
        )

    def _fix_tables(self, content: str) -> str:
        """Add the missing closing pipe to table rows outside code fences."""
        pattern = self._PATTERNS["table_pipe"]
        return self._transform_segments(
            content, 0, lambda text: pattern.sub(r"\1\2|", text)
        )

    def _cleanup_xml_tags(self, content: str) -> str:
        """Remove leftover XML artifact tags."""
        return self._PATTERNS["xml_artifacts"].sub("", content)
class Filter:
# Settings model for the filter; every field below declares a default.
class Valves(BaseModel):
# Ordering value for this filter; per the description, a higher value runs later.
priority: int = Field(
default=50,
description="优先级。数值越高运行越晚 (建议在其他过滤器之后运行)。",
)
# --- Per-rule switches; the names match the NormalizerConfig fields ---
# Fix excessive escape characters.
enable_escape_fix: bool = Field(
default=True, description="修复过度的转义字符 (\\n, \\t 等)"
)
# Normalize chain-of-thought tags (<think> -> <thought>).
enable_thought_tag_fix: bool = Field(
default=True, description="规范化思维链标签 (<think> -> <thought>)"
)
# Fix code block formatting (indentation, newlines).
enable_code_block_fix: bool = Field(
default=True,
description="修复代码块格式 (缩进、换行)",
)
# Normalize LaTeX formula delimiters.
enable_latex_fix: bool = Field(
default=True, description="规范化 LaTeX 公式 (\\[ -> $$, \\( -> $)"
)
# Fix list item newlines (experimental, off by default).
enable_list_fix: bool = Field(
default=False, description="修复列表项换行 (实验性)"
)
# Auto-close unclosed code blocks.
enable_unclosed_block_fix: bool = Field(
default=True, description="自动闭合未闭合的代码块"
)
# Fix full-width symbols in code blocks (off by default).
enable_fullwidth_symbol_fix: bool = Field(
default=False, description="修复代码块中的全角符号"
)
# Fix common Mermaid syntax errors such as unquoted labels.
enable_mermaid_fix: bool = Field(
default=True,
description="修复常见的 Mermaid 语法错误 (如未加引号的标签)",
)
# Fix missing space in headings (#Header -> # Header).
enable_heading_fix: bool = Field(
default=True,
description="修复标题中缺失的空格 (#Header -> # Header)",
)
# Fix missing closing pipe in tables.
enable_table_fix: bool = Field(
default=True, description="修复表格中缺失的闭合管道符"
)
# Clean up leftover XML tags.
enable_xml_tag_cleanup: bool = Field(
default=True, description="清理残留的 XML 标签"
)
# --- Reporting switches ---
# Show a status notification when fixes are applied (checked by _emit_status).
show_status: bool = Field(default=True, description="应用修复时显示状态通知")
# Print debug logs to the browser console (checked by _emit_debug_log).
show_debug_log: bool = Field(
default=False, description="在浏览器控制台打印调试日志 (F12)"
)
def __init__(self):
    """Create the filter with a Valves instance holding the default settings."""
    valves_model = self.Valves
    self.valves = valves_model()
def _contains_html(self, content: str) -> bool:
    """Tell whether ``content`` has an opening or closing tag from a fixed HTML tag list.

    Callers use this to skip normalization so HTML output is not broken.
    Matching is case-insensitive and tolerates whitespace after ``<``.
    """
    html_tag = r"<\s*/?\s*(?:html|head|body|div|span|p|br|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th|img|a|b|i|strong|em|code|pre|blockquote|h[1-6]|script|style|form|input|button|label|select|option|iframe|link|meta|title)\b"
    found = re.search(html_tag, content, re.IGNORECASE)
    return found is not None
async def _emit_status(self, __event_emitter__, applied_fixes: List[str]):
    """Send a "done" status event that lists the applied fixes in Chinese.

    Does nothing when the ``show_status`` valve is off, when no fix was
    applied, or when no emitter is available. Fix labels without a
    translation are shown unchanged.
    """
    if not self.valves.show_status or not applied_fixes or not __event_emitter__:
        return
    # Translate fix names for status display
    fix_map = {
        "Fix Escape Chars": "转义字符",
        "Normalize Thought Tags": "思维标签",
        "Fix Code Blocks": "代码块",
        "Normalize LaTeX": "LaTeX公式",
        "Fix List Format": "列表格式",
        "Close Code Blocks": "闭合代码块",
        "Fix Full-width Symbols": "全角符号",
        "Fix Mermaid Syntax": "Mermaid语法",
        "Fix Headings": "标题格式",
        "Fix Tables": "表格格式",
        "Cleanup XML Tags": "XML清理",
        "Custom Cleaner": "自定义清理",
    }
    translated_fixes = [fix_map.get(fix, fix) for fix in applied_fixes]
    description = f"✓ Markdown 已修复: {', '.join(translated_fixes)}"
    try:
        await __event_emitter__(
            {
                "type": "status",
                "data": {
                    "description": description,
                    "done": True,
                },
            }
        )
    except Exception as e:
        # Best effort: a failed notification must not break the response.
        # Reported through the module logger instead of print().
        logger.warning("Error emitting status: %s", e)
async def _emit_debug_log(
    self, __event_call__, applied_fixes: List[str], original: str, normalized: str
):
    """Print the applied fixes and the before/after content in the browser console.

    Builds a small JavaScript snippet that opens a console group and logs the
    three values, then hands it to ``__event_call__`` as an ``execute`` event.

    Args:
        __event_call__: Async callable that receives the event dict. May be
            ``None``, in which case nothing is sent.
        applied_fixes: Names of the fixes that changed the content.
        original: Content before normalization.
        normalized: Content after normalization.

    Nothing is sent when the ``show_debug_log`` valve is off or when no
    callable is available. Any failure while building or sending the snippet
    is logged and never propagated to the caller.
    """
    if not self.valves.show_debug_log or not __event_call__:
        return

    try:
        # Each value is serialised with json.dumps and inserted as a
        # JavaScript literal; ensure_ascii=False keeps non-ASCII text readable.
        js_code = f"""
            (async function() {{
                console.group("🛠️ Markdown Normalizer Debug");
                console.log("Applied Fixes:", {json.dumps(applied_fixes, ensure_ascii=False)});
                console.log("Original Content:", {json.dumps(original, ensure_ascii=False)});
                console.log("Normalized Content:", {json.dumps(normalized, ensure_ascii=False)});
                console.groupEnd();
            }})();
            """

        await __event_call__(
            {
                "type": "execute",
                "data": {"code": js_code},
            }
        )
    except Exception as e:
        # Best effort: debug output must not break the response.
        # Report through logging (with traceback) rather than print().
        logging.getLogger(__name__).exception("Error emitting debug log: %s", e)
async def outlet(
    self,
    body: dict,
    __user__: Optional[dict] = None,
    __event_emitter__=None,
    __event_call__=None,
    __metadata__: Optional[dict] = None,
) -> dict:
    """Normalize the Markdown of the last assistant message in *body*.

    Only the final entry of ``body["messages"]`` is considered, and only when
    its role is ``"assistant"`` and its content is a string that does not
    look like HTML. If normalization changes the text, the message is updated
    in place and the status / debug events are emitted.

    Returns:
        The same *body* object that was passed in.
    """
    if "messages" not in body or not body["messages"]:
        return body

    last_message = body["messages"][-1]
    text = last_message.get("content", "") or ""
    if last_message.get("role") != "assistant" or not isinstance(text, str):
        return body

    # Leave HTML-looking output alone so it is not broken by Markdown fixes.
    if self._contains_html(text):
        return body

    # Mirror every rule switch from the valves into the normalizer config.
    rule_names = (
        "enable_escape_fix",
        "enable_thought_tag_fix",
        "enable_code_block_fix",
        "enable_latex_fix",
        "enable_list_fix",
        "enable_unclosed_block_fix",
        "enable_fullwidth_symbol_fix",
        "enable_mermaid_fix",
        "enable_heading_fix",
        "enable_table_fix",
        "enable_xml_tag_cleanup",
    )
    rule_flags = {rule: getattr(self.valves, rule) for rule in rule_names}
    normalizer = ContentNormalizer(NormalizerConfig(**rule_flags))

    fixed_text = normalizer.normalize(text)
    if fixed_text != text:
        last_message["content"] = fixed_text

        # NOTE(review): the debug log shares the emitter guard with the status
        # event, following the original statement order — confirm this nesting
        # is intended, since the debug log itself only needs __event_call__.
        if __event_emitter__:
            fixes = normalizer.applied_fixes
            await self._emit_status(__event_emitter__, fixes)
            await self._emit_debug_log(__event_call__, fixes, text, fixed_text)

    return body

View File

@@ -0,0 +1,191 @@
import unittest
import sys
import os
# Put this test file's own directory on sys.path so the sibling
# `markdown_normalizer` module imported below can be found.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
from markdown_normalizer import ContentNormalizer, NormalizerConfig
class TestMarkdownNormalizer(unittest.TestCase):
    """Unit tests for ``ContentNormalizer`` with every normalization rule enabled."""

    def setUp(self):
        """Build a normalizer whose config switches every rule on explicitly."""
        self.config = NormalizerConfig(
            enable_escape_fix=True,
            enable_thought_tag_fix=True,
            enable_code_block_fix=True,
            enable_latex_fix=True,
            enable_list_fix=True,
            enable_unclosed_block_fix=True,
            enable_fullwidth_symbol_fix=True,
            enable_mermaid_fix=True,
            # Set explicitly so the heading/table tests below do not depend on
            # the config's default values.
            enable_heading_fix=True,
            enable_table_fix=True,
            enable_xml_tag_cleanup=True,
        )
        self.normalizer = ContentNormalizer(self.config)

    def test_escape_fix(self):
        """Literal ``\\n`` / ``\\t`` sequences become real newline / tab characters."""
        input_text = "Line 1\\nLine 2\\tTabbed"
        expected = "Line 1\nLine 2\tTabbed"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_thought_tag_fix(self):
        """Thought tags are standardized and followed by a blank line."""
        # Case 1: Standard tag spacing
        input_text = "Thinking...</thought>Result"
        expected = "Thinking...</thought>\n\nResult"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

        # Case 2: Tag standardization (<think> -> <thought>)
        input_text_deepseek = "<think>Deep thinking...</think>Result"
        expected_deepseek = "<thought>Deep thinking...</thought>\n\nResult"
        self.assertEqual(
            self.normalizer.normalize(input_text_deepseek), expected_deepseek
        )

    def test_code_block_fix(self):
        """Code fences are de-indented and separated from surrounding text."""
        # Case 1: Indentation
        self.assertEqual(self.normalizer._fix_code_blocks(" ```python"), "```python")

        # Case 2: Prefix (newline before block)
        self.assertEqual(
            self.normalizer._fix_code_blocks("Text```python"), "Text\n```python"
        )

        # Case 3: Suffix (newline after lang)
        self.assertEqual(
            self.normalizer._fix_code_blocks("```python print('hi')"),
            "```python\nprint('hi')",
        )

    def test_latex_fix(self):
        r"""``\[ \]`` and ``\( \)`` delimiters are rewritten to ``$$`` and ``$``."""
        input_text = "Block: \\[ x^2 \\] Inline: \\( E=mc^2 \\)"
        expected = "Block: $$ x^2 $$ Inline: $ E=mc^2 $"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_list_fix(self):
        """A list marker glued to the preceding text is moved onto its own line."""
        input_text_bad = "Header1. Item 1"
        expected = "Header\n1. Item 1"
        self.assertEqual(self.normalizer.normalize(input_text_bad), expected)

    def test_unclosed_code_block_fix(self):
        """An unterminated code fence gets a closing fence appended."""
        input_text = "```python\nprint('hello')"
        expected = "```python\nprint('hello')\n```"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_fullwidth_symbol_fix(self):
        """Full-width symbols inside a code block are replaced; text outside is kept."""
        # NOTE(review): the full-width characters (： （ ）) were lost from this
        # test and have been restored here; an empty-string assertNotIn can
        # never pass. Confirm they match the symbols the normalizer converts.
        input_text = "Outside：Fullwidth ```python\nprint（'hello'）```"

        normalized = self.normalizer.normalize(input_text)
        self.assertIn("print('hello')", normalized)
        self.assertIn("Outside：Fullwidth", normalized)
        self.assertNotIn("（", normalized)
        self.assertNotIn("）", normalized)

    def test_mermaid_fix(self):
        """Unquoted Mermaid node labels are wrapped in double quotes."""
        # Note: Regex-based fix handles mixed brackets well (e.g. [] inside ())
        # but cannot perfectly handle same-type nesting (e.g. {} inside {}) without a parser.
        input_text = """
```mermaid
graph TD
A[Label with (parens)] --> B(Label with [brackets])
C{Label with [brackets]}
```
"""
        normalized = self.normalizer.normalize(input_text)

        self.assertIn('A["Label with (parens)"]', normalized)
        self.assertIn('B("Label with [brackets]")', normalized)
        self.assertIn('C{"Label with [brackets]"}', normalized)

    def test_mermaid_shapes_regression(self):
        """Multi-character shape delimiters stay intact when labels are quoted."""
        # Regression test for "reverse optimization" where ((...)) was broken into ("(...)")
        input_text = """
```mermaid
graph TD
Start((开始)) --> Input[[输入]]
Input --> Verify{验证}
Verify --> End(((结束)))
```
"""
        normalized = self.normalizer.normalize(input_text)

        self.assertIn('Start(("开始"))', normalized)
        self.assertIn('Input[["输入"]]', normalized)
        self.assertIn('Verify{"验证"}', normalized)
        self.assertIn('End((("结束")))', normalized)

    def test_xml_cleanup(self):
        """Leftover XML wrapper tags are removed while their inner text is kept."""
        input_text = "Some text <antArtifact>hidden</antArtifact> visible"
        expected = "Some text hidden visible"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_heading_fix(self):
        """A missing space after the heading marker is inserted."""
        input_text = "#Heading 1\n##Heading 2\n### Valid Heading"
        expected = "# Heading 1\n## Heading 2\n### Valid Heading"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_table_fix(self):
        """Table rows missing the trailing pipe get one appended."""
        input_text = "| Col 1 | Col 2\n| Val 1 | Val 2"
        expected = "| Col 1 | Col 2|\n| Val 1 | Val 2|"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_mermaid_subgraph_autoclose(self):
        """Test auto-closing of Mermaid subgraphs"""
        # Case 1: Simple unclosed subgraph
        original = """
```mermaid
graph TD
subgraph One
A --> B
```
"""
        expected = """
```mermaid
graph TD
subgraph One
A --> B
end
```
"""
        # A and B are bare node IDs with no shape delimiter, so this test
        # expects them to come back unquoted.
        normalized = self.normalizer.normalize(original)

        # Compare with surrounding whitespace stripped
        self.assertIn("end", normalized)
        self.assertEqual(normalized.strip(), expected.strip())

        # Case 2: Nested unclosed subgraphs
        original_nested = """
```mermaid
graph TD
subgraph Outer
subgraph Inner
C --> D
```
"""
        normalized_nested = self.normalizer.normalize(original_nested)
        self.assertEqual(normalized_nested.count("end"), 2)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()