feat: enhance markdown normalizer with mermaid fix and frontend logging
This commit is contained in:
519
plugins/filters/markdown_normalizer/markdown_normalizer.py
Normal file
519
plugins/filters/markdown_normalizer/markdown_normalizer.py
Normal file
@@ -0,0 +1,519 @@
|
||||
"""
|
||||
title: Markdown Normalizer
|
||||
author: Fu-Jie
|
||||
author_url: https://github.com/Fu-Jie
|
||||
funding_url: https://github.com/Fu-Jie/awesome-openwebui
|
||||
version: 1.0.0
|
||||
description: A production-grade content normalizer filter that fixes common Markdown formatting issues in LLM outputs, such as broken code blocks, LaTeX formulas, and list formatting.
|
||||
"""
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional, List, Callable
|
||||
import re
|
||||
import logging
|
||||
import logging
|
||||
import asyncio
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
# Configure logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class NormalizerConfig:
    """Configuration class for enabling/disabling specific normalization rules"""

    # One flag per normalization pass; ContentNormalizer.normalize() applies
    # the enabled passes in a fixed order.
    enable_escape_fix: bool = True  # Fix excessive escape characters
    enable_thought_tag_fix: bool = True  # Normalize thought tags
    enable_code_block_fix: bool = True  # Fix code block formatting
    enable_latex_fix: bool = True  # Fix LaTeX formula formatting
    enable_list_fix: bool = (
        False  # Fix list item newlines (default off as it can be aggressive)
    )
    enable_unclosed_block_fix: bool = True  # Auto-close unclosed code blocks
    enable_fullwidth_symbol_fix: bool = False  # Fix full-width symbols in code blocks
    enable_mermaid_fix: bool = True  # Fix common Mermaid syntax errors
    enable_heading_fix: bool = (
        True  # Fix missing space in headings (#Header -> # Header)
    )
    enable_table_fix: bool = True  # Fix missing closing pipe in tables
    enable_xml_tag_cleanup: bool = True  # Cleanup leftover XML tags

    # Custom cleaner functions (for advanced extension); each receives the
    # current content string and returns the cleaned string.
    custom_cleaners: List[Callable[[str], str]] = field(default_factory=list)
|
||||
|
||||
|
||||
class ContentNormalizer:
    """LLM output content normalizer - production grade implementation.

    Applies a configurable pipeline of regex-based fixes over a Markdown
    string: escape characters, thought tags, code fences, LaTeX delimiters,
    numbered lists, Mermaid diagrams, headings, tables and leftover XML
    artifact tags.  The labels of the fixes that actually changed the
    content are collected in ``applied_fixes`` after each ``normalize()``
    call.  On any internal error the original content is returned
    unchanged (production safeguard).
    """

    # Class-scoped logger so the class works standalone (the module also
    # defines a ``logger`` with the same name).
    _logger = logging.getLogger(__name__)

    # --- 1. Pre-compiled Regex Patterns (Performance Optimization) ---
    _PATTERNS = {
        # Code block prefix: ``` that is not at the start of a line or file
        "code_block_prefix": re.compile(r"(?<!^)(?<!\n)(```)", re.MULTILINE),
        # Code block suffix: ```lang followed by non-newline content on the same line
        "code_block_suffix": re.compile(r"(```[\w\+\-\.]*)[ \t]+([^\n\r])"),
        # Code block indent: whitespace at start of line before ```
        "code_block_indent": re.compile(r"^[ \t]+(```)", re.MULTILINE),
        # Closing thought tag plus any trailing whitespace/newlines
        "thought_end": re.compile(
            r"</(thought|think|thinking)>[ \t]*\n*", re.IGNORECASE
        ),
        "thought_start": re.compile(r"<(thought|think|thinking)>", re.IGNORECASE),
        # LaTeX block: \[ ... \]
        "latex_bracket_block": re.compile(r"\\\[(.+?)\\\]", re.DOTALL),
        # LaTeX inline: \( ... \)
        "latex_paren_inline": re.compile(r"\\\((.+?)\\\)"),
        # Numbered list item glued to preceding text: non-newline + "1. "
        "list_item": re.compile(r"([^\n])(\d+\. )"),
        # Leftover XML artifacts (e.g. Claude's)
        "xml_artifacts": re.compile(
            r"</?(?:antArtifact|antThinking|artifact)[^>]*>", re.IGNORECASE
        ),
        # Mermaid: match the various node shapes and quote unquoted labels.
        # Delimiters must be matched exactly so the node shape survives the
        # rewrite; longer delimiters are listed first so they win.
        "mermaid_node": re.compile(
            r"(\w+)\s*(?:"
            r"(\(\(\()(?![\"])(.*?)(?<![\"])(\)\)\))|"  # (((...))) Double Circle
            r"(\(\()(?![\"])(.*?)(?<![\"])(\)\))|"  # ((...)) Circle
            r"(\(\[)(?![\"])(.*?)(?<![\"])(\]\))|"  # ([...]) Stadium
            r"(\[\()(?![\"])(.*?)(?<![\"])(\)\])|"  # [(...)] Cylinder
            r"(\[\[)(?![\"])(.*?)(?<![\"])(\]\])|"  # [[...]] Subroutine
            r"(\{\{)(?![\"])(.*?)(?<![\"])(\}\})|"  # {{...}} Hexagon
            r"(\[/)(?![\"])(.*?)(?<![\"])(/\])|"  # [/.../] Parallelogram
            r"(\[\\)(?![\"])(.*?)(?<![\"])(\\\])|"  # [\...\] Parallelogram Alt
            r"(\[/)(?![\"])(.*?)(?<![\"])(\\\])|"  # [/...\] Trapezoid
            r"(\[\\)(?![\"])(.*?)(?<![\"])(/\])|"  # [\.../] Trapezoid Alt
            r"(\()(?![\"])(.*?)(?<![\"])(\))|"  # (...) Round
            r"(\[)(?![\"])(.*?)(?<![\"])(\])|"  # [...] Square
            r"(\{)(?![\"])(.*?)(?<![\"])(\})|"  # {...} Rhombus
            r"(>)(?![\"])(.*?)(?<![\"])(\])"  # >...] Asymmetric
            r")"
        ),
        # Heading: #Heading -> # Heading
        "heading_space": re.compile(r"^(#+)([^ \n#])", re.MULTILINE),
        # Table: | col1 | col2 -> | col1 | col2 |
        "table_pipe": re.compile(r"^(\|.*[^|\n])$", re.MULTILINE),
    }

    # Full-width / smart punctuation -> ASCII, applied inside code blocks.
    # NOTE: the original map was mojibake-garbled (curly quotes collapsed
    # into duplicate straight-quote keys and an unterminated '"""' token);
    # restored to the intended characters.  str.translate does the whole
    # substitution in a single C-level pass.
    _FULLWIDTH_TRANSLATION = str.maketrans(
        {
            "，": ",",
            "。": ".",
            "（": "(",
            "）": ")",
            "【": "[",
            "】": "]",
            "；": ";",
            "：": ":",
            "？": "?",
            "！": "!",
            "“": '"',
            "”": '"',
            "‘": "'",
            "’": "'",
        }
    )

    def __init__(self, config: Optional["NormalizerConfig"] = None):
        """Create a normalizer.

        Args:
            config: rule toggles; a default ``NormalizerConfig`` is built
                when omitted.
        """
        self.config = config or NormalizerConfig()
        # Labels of the fixes that changed content on the last normalize().
        self.applied_fixes = []

    def normalize(self, content: str) -> str:
        """Main entry point: apply all enabled normalization rules in order.

        Returns the normalized content, or the original content unchanged
        when it is empty or an unexpected error occurs.
        """
        self.applied_fixes = []
        if not content:
            return content

        original_content = content  # Keep a copy for debug logging

        try:
            cfg = self.config
            # Ordered pipeline of (enabled, fixer, label).  Order matters:
            # escape fixing must run first because later regexes assume
            # real (unescaped) newlines.
            pipeline = [
                (cfg.enable_escape_fix, self._fix_escape_characters, "Fix Escape Chars"),
                (cfg.enable_thought_tag_fix, self._fix_thought_tags, "Normalize Thought Tags"),
                (cfg.enable_code_block_fix, self._fix_code_blocks, "Fix Code Blocks"),
                (cfg.enable_latex_fix, self._fix_latex_formulas, "Normalize LaTeX"),
                (cfg.enable_list_fix, self._fix_list_formatting, "Fix List Format"),
                (cfg.enable_unclosed_block_fix, self._fix_unclosed_code_blocks, "Close Code Blocks"),
                (cfg.enable_fullwidth_symbol_fix, self._fix_fullwidth_symbols_in_code, "Fix Full-width Symbols"),
                (cfg.enable_mermaid_fix, self._fix_mermaid_syntax, "Fix Mermaid Syntax"),
                (cfg.enable_heading_fix, self._fix_headings, "Fix Headings"),
                (cfg.enable_table_fix, self._fix_tables, "Fix Tables"),
                (cfg.enable_xml_tag_cleanup, self._cleanup_xml_tags, "Cleanup XML Tags"),
            ]
            for enabled, fixer, label in pipeline:
                if not enabled:
                    continue
                fixed = fixer(content)
                if fixed != content:
                    self.applied_fixes.append(label)
                content = fixed

            # 12. Custom cleaners run last so they see fully normalized text.
            for cleaner in cfg.custom_cleaners:
                fixed = cleaner(content)
                if fixed != content:
                    self.applied_fixes.append("Custom Cleaner")
                content = fixed

            if self.applied_fixes:
                self._logger.info(
                    "Markdown Normalizer Applied Fixes: %s", self.applied_fixes
                )
                self._logger.debug(
                    "--- Original Content ---\n%s\n------------------------",
                    original_content,
                )
                self._logger.debug(
                    "--- Normalized Content ---\n%s\n--------------------------",
                    content,
                )

            return content

        except Exception as e:
            # Production safeguard: return original content on error
            self._logger.error(f"Content normalization failed: {e}", exc_info=True)
            return content

    def _fix_escape_characters(self, content: str) -> str:
        """Collapse literal escape sequences (\\n, \\t, ...) into real characters."""
        content = content.replace("\\r\\n", "\n")
        content = content.replace("\\n", "\n")
        content = content.replace("\\t", "\t")
        content = content.replace("\\\\", "\\")
        return content

    def _fix_thought_tags(self, content: str) -> str:
        """Normalize thought tags: unify naming and fix spacing."""
        # 1. Standardize start tag: <think>, <thinking> -> <thought>
        content = self._PATTERNS["thought_start"].sub("<thought>", content)
        # 2. Standardize end tag and ensure a blank line after it.
        return self._PATTERNS["thought_end"].sub("</thought>\n\n", content)

    def _fix_code_blocks(self, content: str) -> str:
        """Fix code fence formatting (indentation, surrounding newlines)."""
        # Remove indentation before code fences
        content = self._PATTERNS["code_block_indent"].sub(r"\1", content)
        # Ensure a newline before ```
        content = self._PATTERNS["code_block_prefix"].sub(r"\n\1", content)
        # Ensure a newline after ```lang
        content = self._PATTERNS["code_block_suffix"].sub(r"\1\n\2", content)
        return content

    def _fix_latex_formulas(self, content: str) -> str:
        r"""Normalize LaTeX delimiters: \[ -> $$ (block), \( -> $ (inline)."""
        content = self._PATTERNS["latex_bracket_block"].sub(r"$$\1$$", content)
        content = self._PATTERNS["latex_paren_inline"].sub(r"$\1$", content)
        return content

    def _fix_list_formatting(self, content: str) -> str:
        """Insert missing newlines before numbered list items ('text1. item')."""
        return self._PATTERNS["list_item"].sub(r"\1\n\2", content)

    def _fix_unclosed_code_blocks(self, content: str) -> str:
        """Append a closing fence when the number of ``` markers is odd."""
        if content.count("```") % 2 != 0:
            content += "\n```"
        return content

    def _fix_fullwidth_symbols_in_code(self, content: str) -> str:
        """Convert full-width punctuation to half-width inside code blocks."""
        parts = content.split("```")
        # Code block content is at odd indices: 1, 3, 5...
        for i in range(1, len(parts), 2):
            parts[i] = parts[i].translate(self._FULLWIDTH_TRANSLATION)
        return "```".join(parts)

    def _fix_mermaid_syntax(self, content: str) -> str:
        """Quote unquoted Mermaid node labels and auto-close subgraphs."""

        def replacer(match):
            # Group 1 is the node ID.
            id_str = match.group(1)

            # Shape alternatives start at tuple index 1; each contributes
            # three groups (open delimiter, label, close delimiter).  Find
            # the one that matched.
            groups = match.groups()
            for i in range(1, len(groups), 3):
                if groups[i] is not None:
                    open_char = groups[i]
                    label = groups[i + 1]
                    close_char = groups[i + 2]

                    # Escape embedded quotes before wrapping the label.
                    label = label.replace('"', '\\"')

                    return f'{id_str}{open_char}"{label}"{close_char}'

            return match.group(0)

        parts = content.split("```")
        for i in range(1, len(parts), 2):
            # Only process fenced blocks whose info string mentions mermaid.
            lang_line = parts[i].split("\n", 1)[0].strip().lower()
            if "mermaid" in lang_line:
                # Apply the comprehensive node-label fix.
                parts[i] = self._PATTERNS["mermaid_node"].sub(replacer, parts[i])

                # Auto-close subgraphs.  NOTE(review): heuristic word
                # count — an "end" inside a (now quoted) label is still
                # counted, which can suppress the fix.
                subgraph_count = len(
                    re.findall(r"\bsubgraph\b", parts[i], re.IGNORECASE)
                )
                end_count = len(re.findall(r"\bend\b", parts[i], re.IGNORECASE))

                if subgraph_count > end_count:
                    missing_ends = subgraph_count - end_count
                    parts[i] = parts[i].rstrip() + ("\n end" * missing_ends) + "\n"

        return "```".join(parts)

    def _fix_headings(self, content: str) -> str:
        """Fix missing space in headings: #Heading -> # Heading.

        Splitting on ``` keeps code-block comments like ``#TODO`` untouched.
        """
        parts = content.split("```")
        for i in range(0, len(parts), 2):  # Even indices are markdown text
            parts[i] = self._PATTERNS["heading_space"].sub(r"\1 \2", parts[i])
        return "```".join(parts)

    def _fix_tables(self, content: str) -> str:
        """Append the missing closing pipe on table rows (outside code blocks)."""
        parts = content.split("```")
        for i in range(0, len(parts), 2):
            parts[i] = self._PATTERNS["table_pipe"].sub(r"\1|", parts[i])
        return "```".join(parts)

    def _cleanup_xml_tags(self, content: str) -> str:
        """Remove leftover artifact XML tags."""
        return self._PATTERNS["xml_artifacts"].sub("", content)
|
||||
|
||||
|
||||
class Filter:
    """Open WebUI outlet filter that normalizes assistant Markdown output.

    Builds a NormalizerConfig from the user-tunable valves, runs
    ContentNormalizer over the last assistant message, and optionally
    emits a status notification and a browser-console debug log.
    """

    class Valves(BaseModel):
        priority: int = Field(
            default=50,
            description="Priority level. Higher runs later (recommended to run after other filters).",
        )
        enable_escape_fix: bool = Field(
            default=True, description="Fix excessive escape characters (\\n, \\t, etc.)"
        )
        enable_thought_tag_fix: bool = Field(
            default=True, description="Normalize </thought> tags"
        )
        enable_code_block_fix: bool = Field(
            default=True,
            description="Fix code block formatting (indentation, newlines)",
        )
        enable_latex_fix: bool = Field(
            default=True, description="Normalize LaTeX formulas (\\[ -> $$, \\( -> $)"
        )
        enable_list_fix: bool = Field(
            default=False, description="Fix list item newlines (Experimental)"
        )
        enable_unclosed_block_fix: bool = Field(
            default=True, description="Auto-close unclosed code blocks"
        )
        enable_fullwidth_symbol_fix: bool = Field(
            default=False, description="Fix full-width symbols in code blocks"
        )
        enable_mermaid_fix: bool = Field(
            default=True,
            description="Fix common Mermaid syntax errors (e.g. unquoted labels)",
        )
        enable_heading_fix: bool = Field(
            default=True,
            description="Fix missing space in headings (#Header -> # Header)",
        )
        enable_table_fix: bool = Field(
            default=True, description="Fix missing closing pipe in tables"
        )
        enable_xml_tag_cleanup: bool = Field(
            default=True, description="Cleanup leftover XML tags"
        )
        show_status: bool = Field(
            default=True, description="Show status notification when fixes are applied"
        )
        show_debug_log: bool = Field(
            default=False, description="Print debug logs to browser console (F12)"
        )

    # Pre-compiled once at class creation; _contains_html is called for
    # every assistant message.
    _HTML_TAG_RE = re.compile(
        r"<\s*/?\s*(?:html|head|body|div|span|p|br|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th|img|a|b|i|strong|em|code|pre|blockquote|h[1-6]|script|style|form|input|button|label|select|option|iframe|link|meta|title)\b",
        re.IGNORECASE,
    )

    def __init__(self):
        self.valves = self.Valves()

    def _contains_html(self, content: str) -> bool:
        """Check if content contains HTML tags (to avoid breaking HTML output)."""
        return bool(self._HTML_TAG_RE.search(content))

    async def _emit_status(self, __event_emitter__, applied_fixes: List[str]):
        """Emit a one-line status notification listing the applied fixes."""
        if not self.valves.show_status or not applied_fixes:
            return

        # applied_fixes is guaranteed non-empty here (guard above).
        description = "✓ Markdown Normalized" + f": {', '.join(applied_fixes)}"

        try:
            await __event_emitter__(
                {
                    "type": "status",
                    "data": {
                        "description": description,
                        "done": True,
                    },
                }
            )
        except Exception as e:
            # Log via the module logger for consistency (was: print).
            logger.error(f"Error emitting status: {e}")

    async def _emit_debug_log(
        self, __event_call__, applied_fixes: List[str], original: str, normalized: str
    ):
        """Emit debug log to the browser console via JS execution."""
        if not self.valves.show_debug_log or not __event_call__:
            return

        try:
            # Construct JS code; json.dumps safely embeds the strings.
            js_code = f"""
            (async function() {{
                console.group("🛠️ Markdown Normalizer Debug");
                console.log("Applied Fixes:", {json.dumps(applied_fixes, ensure_ascii=False)});
                console.log("Original Content:", {json.dumps(original, ensure_ascii=False)});
                console.log("Normalized Content:", {json.dumps(normalized, ensure_ascii=False)});
                console.groupEnd();
            }})();
            """

            await __event_call__(
                {
                    "type": "execute",
                    "data": {"code": js_code},
                }
            )
        except Exception as e:
            # Log via the module logger for consistency (was: print).
            logger.error(f"Error emitting debug log: {e}")

    async def outlet(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __event_emitter__=None,
        __event_call__=None,
        __metadata__: Optional[dict] = None,
    ) -> dict:
        """
        Process the response body to normalize Markdown content.
        """
        if "messages" in body and body["messages"]:
            last = body["messages"][-1]
            content = last.get("content", "") or ""

            if last.get("role") == "assistant" and isinstance(content, str):
                # Skip if content looks like HTML to avoid breaking it
                if self._contains_html(content):
                    return body

                # Configure normalizer based on valves
                config = NormalizerConfig(
                    enable_escape_fix=self.valves.enable_escape_fix,
                    enable_thought_tag_fix=self.valves.enable_thought_tag_fix,
                    enable_code_block_fix=self.valves.enable_code_block_fix,
                    enable_latex_fix=self.valves.enable_latex_fix,
                    enable_list_fix=self.valves.enable_list_fix,
                    enable_unclosed_block_fix=self.valves.enable_unclosed_block_fix,
                    enable_fullwidth_symbol_fix=self.valves.enable_fullwidth_symbol_fix,
                    enable_mermaid_fix=self.valves.enable_mermaid_fix,
                    enable_heading_fix=self.valves.enable_heading_fix,
                    enable_table_fix=self.valves.enable_table_fix,
                    enable_xml_tag_cleanup=self.valves.enable_xml_tag_cleanup,
                )

                normalizer = ContentNormalizer(config)

                # Execute normalization
                new_content = normalizer.normalize(content)

                # Update content if changed
                if new_content != content:
                    last["content"] = new_content

                    # Status notification requires an emitter.
                    if __event_emitter__:
                        await self._emit_status(
                            __event_emitter__, normalizer.applied_fixes
                        )

                    # BUG FIX: the debug log depends on __event_call__ (checked
                    # inside), not on the status emitter — it used to be nested
                    # under the emitter guard and never fired without one.
                    await self._emit_debug_log(
                        __event_call__,
                        normalizer.applied_fixes,
                        content,
                        new_content,
                    )

        return body
|
||||
544
plugins/filters/markdown_normalizer/markdown_normalizer_cn.py
Normal file
544
plugins/filters/markdown_normalizer/markdown_normalizer_cn.py
Normal file
@@ -0,0 +1,544 @@
|
||||
"""
|
||||
title: Markdown 格式修复器 (Markdown Normalizer)
|
||||
author: Fu-Jie
|
||||
author_url: https://github.com/Fu-Jie
|
||||
funding_url: https://github.com/Fu-Jie/awesome-openwebui
|
||||
version: 1.0.0
|
||||
description: 生产级内容规范化过滤器,修复 LLM 输出中常见的 Markdown 格式问题,如损坏的代码块、LaTeX 公式、Mermaid 图表和列表格式。
|
||||
"""
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional, List, Callable
|
||||
import re
|
||||
import logging
|
||||
import asyncio
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
# Configure logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class NormalizerConfig:
    """Configuration class for enabling/disabling specific normalization rules."""

    # One flag per normalization pass; ContentNormalizer.normalize() applies
    # the enabled passes in a fixed order.
    enable_escape_fix: bool = True  # Fix excessive escape characters
    enable_thought_tag_fix: bool = True  # Normalize thought-chain tags
    enable_code_block_fix: bool = True  # Fix code block formatting
    enable_latex_fix: bool = True  # Fix LaTeX formula formatting
    enable_list_fix: bool = False  # Fix list item newlines (default off: can be too aggressive)
    enable_unclosed_block_fix: bool = True  # Auto-close unclosed code blocks
    enable_fullwidth_symbol_fix: bool = False  # Fix full-width symbols in code blocks
    enable_mermaid_fix: bool = True  # Fix common Mermaid syntax errors
    enable_heading_fix: bool = True  # Fix missing space in headings (#Header -> # Header)
    enable_table_fix: bool = True  # Fix missing closing pipe in tables
    enable_xml_tag_cleanup: bool = True  # Clean up leftover XML tags

    # Custom cleaner functions (advanced extension point); each receives the
    # current content string and returns the cleaned string.
    custom_cleaners: List[Callable[[str], str]] = field(default_factory=list)
|
||||
|
||||
|
||||
class ContentNormalizer:
|
||||
"""LLM Output Content Normalizer - Production Grade Implementation"""
|
||||
|
||||
# --- 1. Pre-compiled Regex Patterns (Performance Optimization) ---
|
||||
_PATTERNS = {
|
||||
# Code block prefix: if ``` is not at start of line or file
|
||||
"code_block_prefix": re.compile(r"(?<!^)(?<!\n)(```)", re.MULTILINE),
|
||||
# Code block suffix: ```lang followed by non-whitespace (no newline)
|
||||
"code_block_suffix": re.compile(r"(```[\w\+\-\.]*)[ \t]+([^\n\r])"),
|
||||
# Code block indent: whitespace at start of line + ```
|
||||
"code_block_indent": re.compile(r"^[ \t]+(```)", re.MULTILINE),
|
||||
# Thought tag: </thought> followed by optional whitespace/newlines
|
||||
"thought_end": re.compile(
|
||||
r"</(thought|think|thinking)>[ \t]*\n*", re.IGNORECASE
|
||||
),
|
||||
"thought_start": re.compile(r"<(thought|think|thinking)>", re.IGNORECASE),
|
||||
# LaTeX block: \[ ... \]
|
||||
"latex_bracket_block": re.compile(r"\\\[(.+?)\\\]", re.DOTALL),
|
||||
# LaTeX inline: \( ... \)
|
||||
"latex_paren_inline": re.compile(r"\\\((.+?)\\\)"),
|
||||
# List item: non-newline + digit + dot + space
|
||||
"list_item": re.compile(r"([^\n])(\d+\. )"),
|
||||
# XML artifacts (e.g. Claude's)
|
||||
"xml_artifacts": re.compile(
|
||||
r"</?(?:antArtifact|antThinking|artifact)[^>]*>", re.IGNORECASE
|
||||
),
|
||||
# Mermaid: 匹配各种形状的节点并为未加引号的标签添加引号
|
||||
# 修复"反向优化"问题:必须精确匹配各种形状的定界符,避免破坏形状结构
|
||||
# 优先级:长定界符优先匹配
|
||||
"mermaid_node": re.compile(
|
||||
r"(\w+)\s*(?:"
|
||||
r"(\(\(\()(?![\"])(.*?)(?<![\"])(\)\)\))|" # (((...))) Double Circle
|
||||
r"(\(\()(?![\"])(.*?)(?<![\"])(\)\))|" # ((...)) Circle
|
||||
r"(\(\[)(?![\"])(.*?)(?<![\"])(\]\))|" # ([...]) Stadium
|
||||
r"(\[\()(?![\"])(.*?)(?<![\"])(\)\])|" # [(...)] Cylinder
|
||||
r"(\[\[)(?![\"])(.*?)(?<![\"])(\]\])|" # [[...]] Subroutine
|
||||
r"(\{\{)(?![\"])(.*?)(?<![\"])(\}\})|" # {{...}} Hexagon
|
||||
r"(\[/)(?![\"])(.*?)(?<![\"])(/\])|" # [/.../] Parallelogram
|
||||
r"(\[\\)(?![\"])(.*?)(?<![\"])(\\\])|" # [\...\] Parallelogram Alt
|
||||
r"(\[/)(?![\"])(.*?)(?<![\"])(\\\])|" # [/...\] Trapezoid
|
||||
r"(\[\\)(?![\"])(.*?)(?<![\"])(/\])|" # [\.../] Trapezoid Alt
|
||||
r"(\()(?![\"])(.*?)(?<![\"])(\))|" # (...) Round
|
||||
r"(\[)(?![\"])(.*?)(?<![\"])(\])|" # [...] Square
|
||||
r"(\{)(?![\"])(.*?)(?<![\"])(\})|" # {...} Rhombus
|
||||
r"(>)(?![\"])(.*?)(?<![\"])(\])" # >...] Asymmetric
|
||||
r")"
|
||||
),
|
||||
# Heading: #Heading -> # Heading
|
||||
"heading_space": re.compile(r"^(#+)([^ \n#])", re.MULTILINE),
|
||||
# Table: | col1 | col2 -> | col1 | col2 |
|
||||
"table_pipe": re.compile(r"^(\|.*[^|\n])$", re.MULTILINE),
|
||||
}
|
||||
|
||||
def __init__(self, config: Optional[NormalizerConfig] = None):
|
||||
self.config = config or NormalizerConfig()
|
||||
self.applied_fixes = []
|
||||
|
||||
def normalize(self, content: str) -> str:
|
||||
"""Main entry point: apply all normalization rules in order"""
|
||||
self.applied_fixes = []
|
||||
if not content:
|
||||
return content
|
||||
|
||||
original_content = content # Keep a copy for logging
|
||||
|
||||
try:
|
||||
# 1. Escape character fix (Must be first)
|
||||
if self.config.enable_escape_fix:
|
||||
original = content
|
||||
content = self._fix_escape_characters(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Fix Escape Chars")
|
||||
|
||||
# 2. Thought tag normalization
|
||||
if self.config.enable_thought_tag_fix:
|
||||
original = content
|
||||
content = self._fix_thought_tags(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Normalize Thought Tags")
|
||||
|
||||
# 3. Code block formatting fix
|
||||
if self.config.enable_code_block_fix:
|
||||
original = content
|
||||
content = self._fix_code_blocks(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Fix Code Blocks")
|
||||
|
||||
# 4. LaTeX formula normalization
|
||||
if self.config.enable_latex_fix:
|
||||
original = content
|
||||
content = self._fix_latex_formulas(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Normalize LaTeX")
|
||||
|
||||
# 5. List formatting fix
|
||||
if self.config.enable_list_fix:
|
||||
original = content
|
||||
content = self._fix_list_formatting(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Fix List Format")
|
||||
|
||||
# 6. Unclosed code block fix
|
||||
if self.config.enable_unclosed_block_fix:
|
||||
original = content
|
||||
content = self._fix_unclosed_code_blocks(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Close Code Blocks")
|
||||
|
||||
# 7. Full-width symbol fix (in code blocks only)
|
||||
if self.config.enable_fullwidth_symbol_fix:
|
||||
original = content
|
||||
content = self._fix_fullwidth_symbols_in_code(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Fix Full-width Symbols")
|
||||
|
||||
# 8. Mermaid syntax fix
|
||||
if self.config.enable_mermaid_fix:
|
||||
original = content
|
||||
content = self._fix_mermaid_syntax(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Fix Mermaid Syntax")
|
||||
|
||||
# 9. Heading fix
|
||||
if self.config.enable_heading_fix:
|
||||
original = content
|
||||
content = self._fix_headings(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Fix Headings")
|
||||
|
||||
# 10. Table fix
|
||||
if self.config.enable_table_fix:
|
||||
original = content
|
||||
content = self._fix_tables(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Fix Tables")
|
||||
|
||||
# 11. XML tag cleanup
|
||||
if self.config.enable_xml_tag_cleanup:
|
||||
original = content
|
||||
content = self._cleanup_xml_tags(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Cleanup XML Tags")
|
||||
|
||||
# 9. Custom cleaners
|
||||
for cleaner in self.config.custom_cleaners:
|
||||
original = content
|
||||
content = cleaner(content)
|
||||
if content != original:
|
||||
self.applied_fixes.append("Custom Cleaner")
|
||||
|
||||
if self.applied_fixes:
|
||||
print(f"[Markdown Normalizer] Applied fixes: {self.applied_fixes}")
|
||||
print(
|
||||
f"[Markdown Normalizer] --- Original Content ---\n{original_content}\n------------------------"
|
||||
)
|
||||
print(
|
||||
f"[Markdown Normalizer] --- Normalized Content ---\n{content}\n--------------------------"
|
||||
)
|
||||
|
||||
return content
|
||||
|
||||
except Exception as e:
|
||||
# Production safeguard: return original content on error
|
||||
logger.error(f"Content normalization failed: {e}", exc_info=True)
|
||||
return content
|
||||
|
||||
def _fix_escape_characters(self, content: str) -> str:
|
||||
"""Fix excessive escape characters"""
|
||||
content = content.replace("\\r\\n", "\n")
|
||||
content = content.replace("\\n", "\n")
|
||||
content = content.replace("\\t", "\t")
|
||||
content = content.replace("\\\\", "\\")
|
||||
return content
|
||||
|
||||
def _fix_thought_tags(self, content: str) -> str:
|
||||
"""Normalize thought tags: unify naming and fix spacing"""
|
||||
# 1. Standardize start tag: <think>, <thinking> -> <thought>
|
||||
content = self._PATTERNS["thought_start"].sub("<thought>", content)
|
||||
# 2. Standardize end tag and ensure newlines: </think> -> </thought>\n\n
|
||||
return self._PATTERNS["thought_end"].sub("</thought>\n\n", content)
|
||||
|
||||
def _fix_code_blocks(self, content: str) -> str:
|
||||
"""Fix code block formatting (prefixes, suffixes, indentation)"""
|
||||
# Remove indentation before code blocks
|
||||
content = self._PATTERNS["code_block_indent"].sub(r"\1", content)
|
||||
# Ensure newline before ```
|
||||
content = self._PATTERNS["code_block_prefix"].sub(r"\n\1", content)
|
||||
# Ensure newline after ```lang
|
||||
content = self._PATTERNS["code_block_suffix"].sub(r"\1\n\2", content)
|
||||
return content
|
||||
|
||||
def _fix_latex_formulas(self, content: str) -> str:
|
||||
"""Normalize LaTeX formulas: \[ -> $$ (block), \( -> $ (inline)"""
|
||||
content = self._PATTERNS["latex_bracket_block"].sub(r"$$\1$$", content)
|
||||
content = self._PATTERNS["latex_paren_inline"].sub(r"$\1$", content)
|
||||
return content
|
||||
|
||||
def _fix_list_formatting(self, content: str) -> str:
|
||||
"""Fix missing newlines in lists (e.g., 'text1. item' -> 'text\\n1. item')"""
|
||||
return self._PATTERNS["list_item"].sub(r"\1\n\2", content)
|
||||
|
||||
def _fix_unclosed_code_blocks(self, content: str) -> str:
|
||||
"""Auto-close unclosed code blocks"""
|
||||
if content.count("```") % 2 != 0:
|
||||
content += "\n```"
|
||||
return content
|
||||
|
||||
def _fix_fullwidth_symbols_in_code(self, content: str) -> str:
|
||||
"""Convert full-width symbols to half-width inside code blocks"""
|
||||
FULLWIDTH_MAP = {
|
||||
",": ",",
|
||||
"。": ".",
|
||||
"(": "(",
|
||||
")": ")",
|
||||
"【": "[",
|
||||
"】": "]",
|
||||
";": ";",
|
||||
":": ":",
|
||||
"?": "?",
|
||||
"!": "!",
|
||||
'"': '"',
|
||||
'"': '"',
|
||||
""": "'", """: "'",
|
||||
}
|
||||
|
||||
parts = content.split("```")
|
||||
# Code block content is at odd indices: 1, 3, 5...
|
||||
for i in range(1, len(parts), 2):
|
||||
for full, half in FULLWIDTH_MAP.items():
|
||||
parts[i] = parts[i].replace(full, half)
|
||||
|
||||
return "```".join(parts)
|
||||
|
||||
def _fix_mermaid_syntax(self, content: str) -> str:
    """Fix common Mermaid syntax errors while preserving node shapes.

    Wraps node labels in double quotes (escaping embedded quotes) so that
    labels containing brackets/parens don't break the diagram, and
    auto-closes unbalanced ``subgraph`` blocks. Only applied to fenced
    blocks whose language line mentions "mermaid".
    """

    def replacer(match):
        # Group 1 is the node ID.
        id_str = match.group(1)

        # Find the shape alternative that actually matched.
        # Groups start at regex group 2 (groups() index 1); each shape
        # alternative contributes 3 groups: (open, content, close).
        # NOTE(review): this assumes the "mermaid_node" pattern (defined
        # elsewhere in this class) follows that exact group layout.
        groups = match.groups()
        for i in range(1, len(groups), 3):
            if groups[i] is not None:
                open_char = groups[i]
                content = groups[i + 1]
                close_char = groups[i + 2]

                # Escape any quotes already present in the label.
                content = content.replace('"', '\\"')

                return f'{id_str}{open_char}"{content}"{close_char}'

        # No shape alternative matched: leave the text unchanged.
        return match.group(0)

    parts = content.split("```")
    for i in range(1, len(parts), 2):
        # Check if it's a mermaid block (first line after the fence).
        lang_line = parts[i].split("\n", 1)[0].strip().lower()
        if "mermaid" in lang_line:
            # Quote/escape node labels via the comprehensive regex fix.
            parts[i] = self._PATTERNS["mermaid_node"].sub(replacer, parts[i])

            # Auto-close subgraphs: count 'subgraph' vs 'end' keywords.
            # This is a deliberate heuristic — keywords inside labels could
            # be miscounted, but labels were just quoted above, so the
            # simple whole-block count is usually accurate and cheap.
            subgraph_count = len(
                re.findall(r"\bsubgraph\b", parts[i], re.IGNORECASE)
            )
            end_count = len(re.findall(r"\bend\b", parts[i], re.IGNORECASE))

            if subgraph_count > end_count:
                missing_ends = subgraph_count - end_count
                parts[i] = parts[i].rstrip() + ("\n end" * missing_ends) + "\n"

    return "```".join(parts)
||||
def _fix_headings(self, content: str) -> str:
    """Insert the missing space after heading markers (#Heading -> # Heading).

    Splitting on ``` keeps the fix out of fenced code blocks, where a
    spaceless '#Word' is usually a comment (e.g. '#TODO'), not a heading.
    """
    heading_pattern = self._PATTERNS["heading_space"]
    segments = content.split("```")
    repaired = [
        heading_pattern.sub(r"\1 \2", segment) if index % 2 == 0 else segment
        for index, segment in enumerate(segments)
    ]
    return "```".join(repaired)
|
||||
def _fix_tables(self, content: str) -> str:
    """Append the missing closing pipe to table rows, skipping code blocks."""
    table_pattern = self._PATTERNS["table_pipe"]
    segments = content.split("```")
    for index, segment in enumerate(segments):
        if index % 2 == 0:  # even segments are markdown prose, not code
            segments[index] = table_pattern.sub(r"\1|", segment)
    return "```".join(segments)
|
||||
def _cleanup_xml_tags(self, content: str) -> str:
    """Strip leftover XML artifact tags from the content."""
    artifact_pattern = self._PATTERNS["xml_artifacts"]
    return artifact_pattern.sub("", content)
|
||||
|
||||
class Filter:
    """Open WebUI outlet filter: normalizes Markdown in assistant replies.

    Delegates the actual text fixes to ContentNormalizer (configured from
    the user-editable Valves), emits an optional status notification, and
    can push a debug log to the browser console via the event-call channel.
    """

    class Valves(BaseModel):
        # NOTE: description strings are user-facing UI text (Chinese) and are
        # rendered verbatim by the frontend — do not translate or reformat.
        priority: int = Field(
            default=50,
            description="优先级。数值越高运行越晚 (建议在其他过滤器之后运行)。",
        )
        enable_escape_fix: bool = Field(
            default=True, description="修复过度的转义字符 (\\n, \\t 等)"
        )
        enable_thought_tag_fix: bool = Field(
            default=True, description="规范化思维链标签 (<think> -> <thought>)"
        )
        enable_code_block_fix: bool = Field(
            default=True,
            description="修复代码块格式 (缩进、换行)",
        )
        enable_latex_fix: bool = Field(
            default=True, description="规范化 LaTeX 公式 (\\[ -> $$, \\( -> $)"
        )
        enable_list_fix: bool = Field(
            default=False, description="修复列表项换行 (实验性)"
        )
        enable_unclosed_block_fix: bool = Field(
            default=True, description="自动闭合未闭合的代码块"
        )
        enable_fullwidth_symbol_fix: bool = Field(
            default=False, description="修复代码块中的全角符号"
        )
        enable_mermaid_fix: bool = Field(
            default=True,
            description="修复常见的 Mermaid 语法错误 (如未加引号的标签)",
        )
        enable_heading_fix: bool = Field(
            default=True,
            description="修复标题中缺失的空格 (#Header -> # Header)",
        )
        enable_table_fix: bool = Field(
            default=True, description="修复表格中缺失的闭合管道符"
        )
        enable_xml_tag_cleanup: bool = Field(
            default=True, description="清理残留的 XML 标签"
        )
        show_status: bool = Field(default=True, description="应用修复时显示状态通知")
        show_debug_log: bool = Field(
            default=False, description="在浏览器控制台打印调试日志 (F12)"
        )

    def __init__(self):
        self.valves = self.Valves()

    def _contains_html(self, content: str) -> bool:
        """Check if content contains HTML tags (to avoid breaking HTML output)."""
        pattern = r"<\s*/?\s*(?:html|head|body|div|span|p|br|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th|img|a|b|i|strong|em|code|pre|blockquote|h[1-6]|script|style|form|input|button|label|select|option|iframe|link|meta|title)\b"
        return bool(re.search(pattern, content, re.IGNORECASE))

    async def _emit_status(self, __event_emitter__, applied_fixes: List[str]):
        """Emit a status notification listing the fixes that were applied."""
        if not self.valves.show_status or not applied_fixes:
            return

        description = "✓ Markdown 已修复"
        if applied_fixes:
            # Translate internal fix names to user-facing labels for display.
            fix_map = {
                "Fix Escape Chars": "转义字符",
                "Normalize Thought Tags": "思维标签",
                "Fix Code Blocks": "代码块",
                "Normalize LaTeX": "LaTeX公式",
                "Fix List Format": "列表格式",
                "Close Code Blocks": "闭合代码块",
                "Fix Full-width Symbols": "全角符号",
                "Fix Mermaid Syntax": "Mermaid语法",
                "Fix Headings": "标题格式",
                "Fix Tables": "表格格式",
                "Cleanup XML Tags": "XML清理",
                "Custom Cleaner": "自定义清理",
            }
            translated_fixes = [fix_map.get(fix, fix) for fix in applied_fixes]
            description += f": {', '.join(translated_fixes)}"

        try:
            await __event_emitter__(
                {
                    "type": "status",
                    "data": {
                        "description": description,
                        "done": True,
                    },
                }
            )
        except Exception as e:
            # Best-effort: a failed notification must not break the response.
            print(f"Error emitting status: {e}")

    # NOTE: a duplicate, dead definition of _emit_debug_log (taking
    # __event_emitter__, with a docstring-only body) previously preceded this
    # one and was immediately shadowed by it; it has been removed.
    async def _emit_debug_log(
        self, __event_call__, applied_fixes: List[str], original: str, normalized: str
    ):
        """Emit a debug log to the browser console via JS execution.

        No-op unless the show_debug_log valve is on and an event-call
        channel is available.
        """
        if not self.valves.show_debug_log or not __event_call__:
            return

        try:
            # Serialize each payload straight into the JS source; json.dumps
            # produces valid JS literals and escapes quotes/newlines safely.
            js_code = f"""
            (async function() {{
                console.group("🛠️ Markdown Normalizer Debug");
                console.log("Applied Fixes:", {json.dumps(applied_fixes, ensure_ascii=False)});
                console.log("Original Content:", {json.dumps(original, ensure_ascii=False)});
                console.log("Normalized Content:", {json.dumps(normalized, ensure_ascii=False)});
                console.groupEnd();
            }})();
            """
            await __event_call__(
                {
                    "type": "execute",
                    "data": {"code": js_code},
                }
            )

        except Exception as e:
            # Best-effort: debugging must never break the response path.
            print(f"Error emitting debug log: {e}")

    async def outlet(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __event_emitter__=None,
        __event_call__=None,
        __metadata__: Optional[dict] = None,
    ) -> dict:
        """Process the response body to normalize Markdown content.

        Only the last message is touched, and only when it is a string
        assistant reply that does not look like raw HTML.
        """
        if "messages" in body and body["messages"]:
            last = body["messages"][-1]
            content = last.get("content", "") or ""

            if last.get("role") == "assistant" and isinstance(content, str):
                # Skip if content looks like HTML to avoid breaking it.
                if self._contains_html(content):
                    return body

                # Configure normalizer from the valve toggles.
                config = NormalizerConfig(
                    enable_escape_fix=self.valves.enable_escape_fix,
                    enable_thought_tag_fix=self.valves.enable_thought_tag_fix,
                    enable_code_block_fix=self.valves.enable_code_block_fix,
                    enable_latex_fix=self.valves.enable_latex_fix,
                    enable_list_fix=self.valves.enable_list_fix,
                    enable_unclosed_block_fix=self.valves.enable_unclosed_block_fix,
                    enable_fullwidth_symbol_fix=self.valves.enable_fullwidth_symbol_fix,
                    enable_mermaid_fix=self.valves.enable_mermaid_fix,
                    enable_heading_fix=self.valves.enable_heading_fix,
                    enable_table_fix=self.valves.enable_table_fix,
                    enable_xml_tag_cleanup=self.valves.enable_xml_tag_cleanup,
                )

                normalizer = ContentNormalizer(config)

                # Execute normalization.
                new_content = normalizer.normalize(content)

                # Update the message only when something actually changed.
                if new_content != content:
                    last["content"] = new_content

                    if __event_emitter__:
                        await self._emit_status(
                            __event_emitter__, normalizer.applied_fixes
                        )
                    # Debug log guards on __event_call__ internally.
                    await self._emit_debug_log(
                        __event_call__,
                        normalizer.applied_fixes,
                        content,
                        new_content,
                    )

        return body
|
||||
191
plugins/filters/markdown_normalizer/test_markdown_normalizer.py
Normal file
191
plugins/filters/markdown_normalizer/test_markdown_normalizer.py
Normal file
@@ -0,0 +1,191 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add the current directory to sys.path to import the module
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
sys.path.append(current_dir)
|
||||
|
||||
from markdown_normalizer import ContentNormalizer, NormalizerConfig
|
||||
|
||||
|
||||
class TestMarkdownNormalizer(unittest.TestCase):
    """Unit tests for ContentNormalizer's individual normalization rules."""

    def setUp(self):
        # Enable every rule — including those that default to off — so each
        # test can rely on its rule being active.
        self.config = NormalizerConfig(
            enable_escape_fix=True,
            enable_thought_tag_fix=True,
            enable_code_block_fix=True,
            enable_latex_fix=True,
            enable_list_fix=True,
            enable_unclosed_block_fix=True,
            enable_fullwidth_symbol_fix=True,
            enable_mermaid_fix=True,
            enable_heading_fix=True,
            enable_table_fix=True,
            enable_xml_tag_cleanup=True,
        )
        self.normalizer = ContentNormalizer(self.config)

    def test_escape_fix(self):
        input_text = "Line 1\\nLine 2\\tTabbed"
        expected = "Line 1\nLine 2\tTabbed"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_thought_tag_fix(self):
        # Case 1: Standard tag spacing
        input_text = "Thinking...</thought>Result"
        expected = "Thinking...</thought>\n\nResult"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

        # Case 2: Tag standardization (<think> -> <thought>)
        input_text_deepseek = "<think>Deep thinking...</think>Result"
        expected_deepseek = "<thought>Deep thinking...</thought>\n\nResult"
        self.assertEqual(
            self.normalizer.normalize(input_text_deepseek), expected_deepseek
        )

    def test_code_block_fix(self):
        # Case 1: Indentation
        self.assertEqual(self.normalizer._fix_code_blocks("  ```python"), "```python")

        # Case 2: Prefix (newline before block)
        self.assertEqual(
            self.normalizer._fix_code_blocks("Text```python"), "Text\n```python"
        )

        # Case 3: Suffix (newline after lang)
        self.assertEqual(
            self.normalizer._fix_code_blocks("```python print('hi')"),
            "```python\nprint('hi')",
        )

    def test_latex_fix(self):
        input_text = "Block: \\[ x^2 \\] Inline: \\( E=mc^2 \\)"
        expected = "Block: $$ x^2 $$ Inline: $ E=mc^2 $"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_list_fix(self):
        input_text_bad = "Header1. Item 1"
        expected = "Header\n1. Item 1"
        self.assertEqual(self.normalizer.normalize(input_text_bad), expected)

    def test_unclosed_code_block_fix(self):
        input_text = "```python\nprint('hello')"
        expected = "```python\nprint('hello')\n```"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_fullwidth_symbol_fix(self):
        """Full-width symbols are converted inside code blocks only.

        The fixture uses unicode escapes (\\uff1a = full-width colon,
        \\uff08 / \\uff09 = full-width parens) so the intent survives
        encoding round-trips; the previous literal characters were garbled.
        """
        input_text = (
            "Outside\uff1aFullwidth ```python\nprint\uff08'hello'\uff09```"
        )

        normalized = self.normalizer.normalize(input_text)
        # Inside the code block: converted to ASCII.
        self.assertIn("print('hello')", normalized)
        # Outside the code block: left untouched.
        self.assertIn("Outside\uff1aFullwidth", normalized)
        self.assertNotIn("\uff08", normalized)
        self.assertNotIn("\uff09", normalized)

    def test_mermaid_fix(self):
        # Test Mermaid syntax fix for unquoted labels.
        # Note: the regex-based fix handles mixed brackets well (e.g. [] inside ())
        # but cannot perfectly handle same-type nesting (e.g. {} inside {}) without a parser.
        input_text = """
```mermaid
graph TD
    A[Label with (parens)] --> B(Label with [brackets])
    C{Label with [brackets]}
```
"""
        normalized = self.normalizer.normalize(input_text)

        self.assertIn('A["Label with (parens)"]', normalized)
        self.assertIn('B("Label with [brackets]")', normalized)
        self.assertIn('C{"Label with [brackets]"}', normalized)

    def test_mermaid_shapes_regression(self):
        # Regression test for "reverse optimization" where ((...)) was broken into ("(...)")
        input_text = """
```mermaid
graph TD
    Start((开始)) --> Input[[输入]]
    Input --> Verify{验证}
    Verify --> End(((结束)))
```
"""
        normalized = self.normalizer.normalize(input_text)
        self.assertIn('Start(("开始"))', normalized)
        self.assertIn('Input[["输入"]]', normalized)
        self.assertIn('Verify{"验证"}', normalized)
        self.assertIn('End((("结束")))', normalized)

    def test_xml_cleanup(self):
        input_text = "Some text <antArtifact>hidden</antArtifact> visible"
        expected = "Some text hidden visible"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_heading_fix(self):
        input_text = "#Heading 1\n##Heading 2\n### Valid Heading"
        expected = "# Heading 1\n## Heading 2\n### Valid Heading"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_table_fix(self):
        input_text = "| Col 1 | Col 2\n| Val 1 | Val 2"
        expected = "| Col 1 | Col 2|\n| Val 1 | Val 2|"
        self.assertEqual(self.normalizer.normalize(input_text), expected)

    def test_mermaid_subgraph_autoclose(self):
        """Test auto-closing of Mermaid subgraphs."""
        # Case 1: Simple unclosed subgraph.
        # Note: simple IDs like A and B are NOT matched by the mermaid_node
        # regex (it requires a shape delimiter), so they remain unquoted.
        original = """
```mermaid
graph TD
subgraph One
A --> B
```
"""
        expected = """
```mermaid
graph TD
subgraph One
A --> B
 end
```
"""
        normalized = self.normalizer.normalize(original)
        self.assertIn("end", normalized)
        self.assertEqual(normalized.strip(), expected.strip())

        # Case 2: Nested unclosed subgraphs -> two 'end's appended.
        original_nested = """
```mermaid
graph TD
subgraph Outer
subgraph Inner
C --> D
```
"""
        normalized_nested = self.normalizer.normalize(original_nested)
        self.assertEqual(normalized_nested.count("end"), 2)
|
||||
|
||||
|
||||
# Allow running this file directly: `python test_markdown_normalizer.py`.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user