feat(filters): upgrade markdown-normalizer to v1.2.7

- Fix Issue #49: resolve greedy regex matching in consecutive emphasis
- Add LaTeX formula protection to prevent corruption of \times, \nu, etc.
- Expand i18n support to 12 languages with strict alignment
- Fix NameError caused by a missing Request import during testing
This commit is contained in:
fujie
2026-02-24 15:05:25 +08:00
parent 18ada2a177
commit 2da934dd92
16 changed files with 981 additions and 1009 deletions

View File

@@ -0,0 +1,31 @@
import re


def reproduce_bug():
    """Demonstration script for Issue #49.

    Intended to simulate the old plugin logic in which an unguarded
    ``**...**`` regex, applied repeatedly, could treat the closing ``**``
    of one bold block and the opening ``**`` of the next as a single pair,
    strip()-ing the plain text between them as if it were padded bold
    content.
    """
    text = "I **prefer** tea **to** coffee."
    # Naive pattern: any lazy ** ... ** pair, with no guard preventing a
    # match from bridging two separate bold blocks.
    buggy_pattern = re.compile(r"(\*\*)(.*?)(\*\*)")

    def strip_match(m):
        # Mimics the plugin's old "strip spaces inside emphasis" rewrite.
        return f"{m.group(1)}{m.group(2).strip()}{m.group(1)}"

    def buggy_fix(content):
        return buggy_pattern.sub(strip_match, content)

    # Two passes emulate the plugin's repeat/while loop over the content.
    result_1 = buggy_fix(text)
    result_2 = buggy_fix(result_1)
    print(f"Original: {text}")
    print(f"Step 1: {result_1}")
    print(f"Step 2: {result_2} (Bug Reproduced!)")


if __name__ == "__main__":
    reproduce_bug()

View File

@@ -0,0 +1,28 @@
import re


def reproduce_bug_v2():
    """Reproduce Issue #49 with the pattern closest to the old plugin code.

    The pattern ``\\*\\*( +)(.*?)( +)\\*\\*`` has no guard preventing it
    from pairing the *closing* ``**`` of one bold block with the *opening*
    ``**`` of the next.  In ``I **prefer** tea **to** coffee.`` it matches
    ``** tea **`` and strips the spaces around "tea".

    Returns:
        str: The corrupted output, so callers and tests can assert on it.
    """
    # NOTE: the original script also built an unused `text` variable and an
    # unused compiled `buggy_pattern`; both were dead code and are removed.
    content = "I **prefer** tea **to** coffee."
    # Misidentified pair: closer of "**prefer**" (A) + opener of "**to**" (B).
    #   I **prefer** tea **to**
    #            ^^      ^^
    #            A       B
    bug_result = re.sub(r"\*\*( +)(.*?)( +)\*\*", r"**\2**", content)
    print(f"Input: {content}")
    print(f"Output: {bug_result}")
    return bug_result


if __name__ == "__main__":
    reproduce_bug_v2()

View File

@@ -0,0 +1,44 @@
import sys
import os

# Make the plugin module importable when running this script directly.
current_dir = os.path.dirname(os.path.abspath(__file__))
plugin_dir = os.path.abspath(
    os.path.join(current_dir, "..", "filters", "markdown_normalizer")
)
sys.path.append(plugin_dir)

from markdown_normalizer import ContentNormalizer, NormalizerConfig


def test_latex_protection():
    """Verify escape-character cleanup leaves LaTeX commands intact.

    Covers the commands that collide with Python/JSON string escapes:
    ``\\times`` and ``\\theta`` (``\\t``) and ``\\nu`` (``\\n``).
    """
    # Test case 1: the reported issue with \times inside inline math.
    content_1 = r"Calculation: $C(33, 6) \times C(16, 1)$"
    config = NormalizerConfig(enable_escape_fix=True)
    normalizer = ContentNormalizer(config)
    result_1 = normalizer.normalize(content_1)
    # BUGFIX: the original literal "--- Test 1: \times ... ---" contained a
    # real TAB character, because \t is an escape sequence in a non-raw
    # string — the very corruption this test is about.  Backslashes are now
    # escaped so the banner prints literally.
    print("--- Test 1: \\times Protection ---")
    print(f"Input: {content_1}")
    print(f"Output: {result_1}")
    if r"\times" in result_1:
        print("✅ PASSED")
    else:
        print("❌ FAILED")
    # Test case 2: other collisions — \nu (newline) and \theta (tab).
    content_2 = r"Formula: $\theta = \nu + \tau$"
    result_2 = normalizer.normalize(content_2)
    # BUGFIX: same escape problem as above ("\theta" began with a TAB and
    # "\nu" with a newline); the intentional leading "\n" separator is kept.
    print("\n--- Test 2: \\theta and \\nu Protection ---")
    print(f"Input: {content_2}")
    print(f"Output: {result_2}")
    if r"\theta" in result_2 and r"\nu" in result_2:
        print("✅ PASSED")
    else:
        print("❌ FAILED")


if __name__ == "__main__":
    test_latex_protection()

View File

@@ -0,0 +1,42 @@
import re
def verify_fix_v126():
# 1. 准备触发 Bug 的测试文本
test_cases = [
"I **prefer** tea **to** coffee.", # 标准 Issue #49 案例
"The **quick** brown **fox** jumps **over**.", # 多个加粗块
"** text ** and ** more **", # 需要修复的内部空格
"Calculations: 2 * 3 * 4 = 24", # 不应被识别为强调的数学公式
]
# 2. 使用 v1.2.6 中的核心正则表达式 (移除了可能引起解析错误的中文注释)
# 模式: (?<!\*|_)(\*{1,3}|_{1,3})(?P<inner>(?:(?!\1)[^\n])*?)(\1)(?!\*|_)
pattern_str = r"(?<!\*|_)(\*{1,3}|_{1,3})(?P<inner>(?:(?!\1)[^\n])*?)(\1)(?!\*|_)"
FIX_REGEX = re.compile(pattern_str)
def fixed_normalizer(content):
def replacer(match):
symbol = match.group(1)
inner = match.group("inner")
stripped_inner = inner.strip()
# 只有当确实有空格需要修,且内部不是空的才修复
if stripped_inner != inner and stripped_inner:
return f"{symbol}{stripped_inner}{symbol}"
return match.group(0)
# 模拟插件循环处理
for _ in range(2):
content = FIX_REGEX.sub(replacer, content)
return content
print("--- v1.2.6 Fix Verification ---")
for text in test_cases:
result = fixed_normalizer(text)
print(f"Input: {text}")
print(f"Output: {result}")
print("-" * 30)
if __name__ == "__main__":
verify_fix_v126()

View File

@@ -1,12 +1,22 @@
# Markdown Normalizer Filter
**Author:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **Version:** 1.2.4 | **Project:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **License:** MIT
**Author:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **Version:** 1.2.7 | **Project:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **License:** MIT
A content normalizer filter for Open WebUI that fixes common Markdown formatting issues in LLM outputs. It ensures that code blocks, LaTeX formulas, Mermaid diagrams, and other Markdown elements are rendered correctly.
## 🔥 What's New in v1.2.4
## 🔥 What's New in v1.2.7
* **Documentation Sync**: Synchronized version numbers across all documentation and code files.
* **LaTeX Formula Protection**: Enhanced escape character cleaning to protect LaTeX commands like `\times`, `\nu`, and `\theta` from being corrupted.
* **Expanded i18n Support**: Now supports 12 languages with automatic detection and fallback.
* **Valves Optimization**: Optimized configuration descriptions to be English-only for better consistency.
* **Bug Fixes**:
* Resolved [Issue #49](https://github.com/Fu-Jie/openwebui-extensions/issues/49): Fixed a bug where consecutive bold parts on the same line caused spaces between them to be removed.
* Fixed a `NameError` in the plugin code that caused test collection failures.
## 🌐 Multilingual Support
Supports automatic interface and status switching for the following languages:
`English`, `简体中文`, `繁體中文 (香港)`, `繁體中文 (台灣)`, `한국어`, `日本語`, `Français`, `Deutsch`, `Español`, `Italiano`, `Tiếng Việt`, `Bahasa Indonesia`.
## ✨ Core Features

View File

@@ -1,12 +1,22 @@
# Markdown 格式化过滤器 (Markdown Normalizer)
**作者:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **版本:** 1.2.4 | **项目:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **许可证:** MIT
**作者:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **版本:** 1.2.7 | **项目:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **许可证:** MIT
这是一个用于 Open WebUI 的内容格式化过滤器,旨在修复 LLM 输出中常见的 Markdown 格式问题。它能确保代码块、LaTeX 公式、Mermaid 图表和其他 Markdown 元素被正确渲染。
## 🔥 最新更新 v1.2.4
## 🔥 最新更新 v1.2.7
* **文档更新**: 同步了所有文档和代码文件的版本号
* **LaTeX 公式保护**: 增强了转义字符清理逻辑,自动保护 `$ $` 与 `$$ $$` 内的 LaTeX 命令(如 `\times`、`\nu`、`\theta`),防止渲染失效
* **扩展国际化 (i18n) 支持**: 现已支持 12 种语言,具备自动探测与回退机制。
* **配置项优化**: 将 Valves 配置项的描述统一为英文,保持界面一致性。
* **修复 Bug**:
* 修复了 [Issue #49](https://github.com/Fu-Jie/openwebui-extensions/issues/49):解决了当同一行存在多个加粗部分时,由于正则匹配过于贪婪导致中间内容丢失空格的问题。
* 修复了插件代码中的 `NameError` 错误,确保测试脚本能正常运行。
## 🌐 多语言支持 (i18n)
支持以下语言的界面与状态自动切换:
`English`, `简体中文`, `繁體中文 (香港)`, `繁體中文 (台灣)`, `한국어`, `日本語`, `Français`, `Deutsch`, `Español`, `Italiano`, `Tiếng Việt`, `Bahasa Indonesia`
## ✨ 核心特性

View File

@@ -3,13 +3,14 @@ title: Markdown Normalizer
author: Fu-Jie
author_url: https://github.com/Fu-Jie/openwebui-extensions
funding_url: https://github.com/open-webui
version: 1.2.4
version: 1.2.7
openwebui_id: baaa8732-9348-40b7-8359-7e009660e23c
description: A content normalizer filter that fixes common Markdown formatting issues in LLM outputs, such as broken code blocks, LaTeX formulas, and list formatting.
description: A content normalizer filter that fixes common Markdown formatting issues in LLM outputs, such as broken code blocks, LaTeX formulas, and list formatting. Including LaTeX command protection.
"""
from pydantic import BaseModel, Field
from typing import Optional, List, Callable, Dict
from typing import Optional, List, Callable, Dict, Any
from fastapi import Request
import re
import logging
import asyncio
@@ -20,6 +21,217 @@ from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
# i18n Translations
# Locale code -> table of UI strings for the status notification:
# "status_prefix" heads the message, and the "fix_*" keys label each
# applied fix.  Every table defines the same key set; "en-US" also serves
# as the final fallback table.
TRANSLATIONS = {
    "en-US": {
        "status_prefix": "✓ Markdown Normalized",
        "fix_escape": "Escape Characters",
        "fix_thought": "Thought Tags",
        "fix_details": "Details Tags",
        "fix_code": "Code Blocks",
        "fix_latex": "LaTeX Formulas",
        "fix_list": "List Format",
        "fix_close_code": "Close Code Blocks",
        "fix_fullwidth": "Full-width Symbols",
        "fix_mermaid": "Mermaid Syntax",
        "fix_heading": "Heading Format",
        "fix_table": "Table Format",
        "fix_xml": "XML Cleanup",
        "fix_emphasis": "Emphasis Spacing",
        "fix_custom": "Custom Cleaner",
    },
    # Simplified Chinese
    "zh-CN": {
        "status_prefix": "✓ Markdown 已修复",
        "fix_escape": "转义字符",
        "fix_thought": "思维标签",
        "fix_details": "Details标签",
        "fix_code": "代码块",
        "fix_latex": "LaTeX公式",
        "fix_list": "列表格式",
        "fix_close_code": "闭合代码块",
        "fix_fullwidth": "全角符号",
        "fix_mermaid": "Mermaid语法",
        "fix_heading": "标题格式",
        "fix_table": "表格格式",
        "fix_xml": "XML清理",
        "fix_emphasis": "强调空格",
        "fix_custom": "自定义清理",
    },
    # Traditional Chinese (Hong Kong)
    "zh-HK": {
        "status_prefix": "✓ Markdown 已修復",
        "fix_escape": "轉義字元",
        "fix_thought": "思維標籤",
        "fix_details": "Details標籤",
        "fix_code": "程式碼區塊",
        "fix_latex": "LaTeX公式",
        "fix_list": "列表格式",
        "fix_close_code": "閉合程式碼區塊",
        "fix_fullwidth": "全形符號",
        "fix_mermaid": "Mermaid語法",
        "fix_heading": "標題格式",
        "fix_table": "表格格式",
        "fix_xml": "XML清理",
        "fix_emphasis": "強調空格",
        "fix_custom": "自訂清理",
    },
    # Traditional Chinese (Taiwan) — currently identical to zh-HK
    "zh-TW": {
        "status_prefix": "✓ Markdown 已修復",
        "fix_escape": "轉義字元",
        "fix_thought": "思維標籤",
        "fix_details": "Details標籤",
        "fix_code": "程式碼區塊",
        "fix_latex": "LaTeX公式",
        "fix_list": "列表格式",
        "fix_close_code": "閉合程式碼區塊",
        "fix_fullwidth": "全形符號",
        "fix_mermaid": "Mermaid語法",
        "fix_heading": "標題格式",
        "fix_table": "表格格式",
        "fix_xml": "XML清理",
        "fix_emphasis": "強調空格",
        "fix_custom": "自訂清理",
    },
    # Korean
    "ko-KR": {
        "status_prefix": "✓ Markdown 정규화됨",
        "fix_escape": "이스케이프 문자",
        "fix_thought": "생각 태그",
        "fix_details": "Details 태그",
        "fix_code": "코드 블록",
        "fix_latex": "LaTeX 공식",
        "fix_list": "목록 형식",
        "fix_close_code": "코드 블록 닫기",
        "fix_fullwidth": "전각 기호",
        "fix_mermaid": "Mermaid 구문",
        "fix_heading": "제목 형식",
        "fix_table": "표 형식",
        "fix_xml": "XML 정리",
        "fix_emphasis": "강조 공백",
        "fix_custom": "사용자 정의 정리",
    },
    # Japanese
    "ja-JP": {
        "status_prefix": "✓ Markdown 正規化済み",
        "fix_escape": "エスケープ文字",
        "fix_thought": "思考タグ",
        "fix_details": "Detailsタグ",
        "fix_code": "コードブロック",
        "fix_latex": "LaTeX数式",
        "fix_list": "リスト形式",
        "fix_close_code": "コードブロックを閉じる",
        "fix_fullwidth": "全角記号",
        "fix_mermaid": "Mermaid構文",
        "fix_heading": "見出し形式",
        "fix_table": "表形式",
        "fix_xml": "XMLクリーンアップ",
        "fix_emphasis": "強調の空白",
        "fix_custom": "カスタムクリーナー",
    },
    # French
    "fr-FR": {
        "status_prefix": "✓ Markdown normalisé",
        "fix_escape": "Caractères d'échappement",
        "fix_thought": "Balises de pensée",
        "fix_details": "Balises Details",
        "fix_code": "Blocs de code",
        "fix_latex": "Formules LaTeX",
        "fix_list": "Format de liste",
        "fix_close_code": "Fermer les blocs de code",
        "fix_fullwidth": "Symboles pleine largeur",
        "fix_mermaid": "Syntaxe Mermaid",
        "fix_heading": "Format de titre",
        "fix_table": "Format de tableau",
        "fix_xml": "Nettoyage XML",
        "fix_emphasis": "Espacement d'emphase",
        "fix_custom": "Nettoyeur personnalisé",
    },
    # German
    "de-DE": {
        "status_prefix": "✓ Markdown normalisiert",
        "fix_escape": "Escape-Zeichen",
        "fix_thought": "Denk-Tags",
        "fix_details": "Details-Tags",
        "fix_code": "Code-Blöcke",
        "fix_latex": "LaTeX-Formeln",
        "fix_list": "Listenformat",
        "fix_close_code": "Code-Blöcke schließen",
        "fix_fullwidth": "Vollbreite Symbole",
        "fix_mermaid": "Mermaid-Syntax",
        "fix_heading": "Überschriftenformat",
        "fix_table": "Tabellenformat",
        "fix_xml": "XML-Bereinigung",
        "fix_emphasis": "Hervorhebungsabstände",
        "fix_custom": "Benutzerdefinierter Reiniger",
    },
    # Spanish
    "es-ES": {
        "status_prefix": "✓ Markdown normalizado",
        "fix_escape": "Caracteres de escape",
        "fix_thought": "Etiquetas de pensamiento",
        "fix_details": "Etiquetas de Details",
        "fix_code": "Bloques de código",
        "fix_latex": "Fórmulas LaTeX",
        "fix_list": "Formato de lista",
        "fix_close_code": "Cerrar bloques de código",
        "fix_fullwidth": "Símbolos de ancho completo",
        "fix_mermaid": "Sintaxis Mermaid",
        "fix_heading": "Formato de encabezado",
        "fix_table": "Formato de tabla",
        "fix_xml": "Limpieza XML",
        "fix_emphasis": "Espaciado de énfasis",
        "fix_custom": "Limpiador personalizado",
    },
    # Italian
    "it-IT": {
        "status_prefix": "✓ Markdown normalizzato",
        "fix_escape": "Caratteri di escape",
        "fix_thought": "Tag di pensiero",
        "fix_details": "Tag Details",
        "fix_code": "Blocchi di codice",
        "fix_latex": "Formule LaTeX",
        "fix_list": "Formato elenco",
        "fix_close_code": "Chiudi blocchi di codice",
        "fix_fullwidth": "Simboli a larghezza intera",
        "fix_mermaid": "Sintassi Mermaid",
        "fix_heading": "Formato intestazione",
        "fix_table": "Formato tabella",
        "fix_xml": "Pulizia XML",
        "fix_emphasis": "Spaziatura enfasi",
        "fix_custom": "Pulitore personalizzato",
    },
    # Vietnamese
    "vi-VN": {
        "status_prefix": "✓ Markdown đã chuẩn hóa",
        "fix_escape": "Ký tự thoát",
        "fix_thought": "Thẻ suy nghĩ",
        "fix_details": "Thẻ Details",
        "fix_code": "Khối mã",
        "fix_latex": "Công thức LaTeX",
        "fix_list": "Định dạng danh sách",
        "fix_close_code": "Đóng khối mã",
        "fix_fullwidth": "Ký tự toàn chiều rộng",
        "fix_mermaid": "Cú pháp Mermaid",
        "fix_heading": "Định dạng tiêu đề",
        "fix_table": "Định dạng bảng",
        "fix_xml": "Dọn dẹp XML",
        "fix_emphasis": "Khoảng cách nhấn mạnh",
        "fix_custom": "Trình dọn dẹp tùy chỉnh",
    },
    # Indonesian
    "id-ID": {
        "status_prefix": "✓ Markdown dinormalisasi",
        "fix_escape": "Karakter escape",
        "fix_thought": "Tag pemikiran",
        "fix_details": "Tag Details",
        "fix_code": "Blok kode",
        "fix_latex": "Formula LaTeX",
        "fix_list": "Format daftar",
        "fix_close_code": "Tutup blok kode",
        "fix_fullwidth": "Simbol lebar penuh",
        "fix_mermaid": "Sintaks Mermaid",
        "fix_heading": "Format heading",
        "fix_table": "Format tabel",
        "fix_xml": "Pembersihan XML",
        "fix_emphasis": "Spasi penekanan",
        "fix_custom": "Pembersih kustom",
    },
}
@dataclass
class NormalizerConfig:
"""Configuration class for enabling/disabling specific normalization rules"""
@@ -96,7 +308,7 @@ class ContentNormalizer:
r"(\[/)(?![\"])(.*?)(?<![\"])(/\])|" # [/.../] Parallelogram
r"(\[\\)(?![\"])(.*?)(?<![\"])(\\\])|" # [\...\] Parallelogram Alt
r"(\[/)(?![\"])(.*?)(?<![\"])(\\\])|" # [/...\] Trapezoid
r"(\[\\)(?![\"])(.*?)(?<![\"])(/\])|" # [\.../] Trapezoid Alt
r"(\[\\)(?![\"])(.*?)(?<![\"])(\/\])|" # [\.../] Trapezoid Alt
r"(\()(?![\"])([^)]*?)(?<![\"])(\))|" # (...) Round - Modified to be safer
r"(\[)(?![\"])(.*?)(?<![\"])(\])|" # [...] Square
r"(\{)(?![\"])(.*?)(?<![\"])(\})|" # {...} Rhombus
@@ -115,7 +327,7 @@ class ContentNormalizer:
# NOTE: We use [^\n] instead of . to prevent cross-line matching.
# Supports: * (italic), ** (bold), *** (bold+italic), _ (italic), __ (bold), ___ (bold+italic)
"emphasis_spacing": re.compile(
r"(?<!\*|_)(\*{1,3}|_{1,3})(?P<inner>[^\n]*?)(\1)(?!\*|_)"
r"(?<!\*|_)(\*{1,3}|_{1,3})(?P<inner>(?:(?!\1)[^\n])*?)(\1)(?!\*|_)"
),
}
@@ -247,30 +459,27 @@ class ContentNormalizer:
return content
def _fix_escape_characters(self, content: str) -> str:
"""Fix excessive escape characters
"""Fix excessive escape characters while protecting LaTeX and code blocks."""
If enable_escape_fix_in_code_blocks is False (default), this method will only
fix escape characters outside of code blocks to avoid breaking valid code
examples (e.g., JSON strings with \\n, regex patterns, etc.).
"""
if self.config.enable_escape_fix_in_code_blocks:
# Apply globally (original behavior)
content = content.replace("\\r\\n", "\n")
content = content.replace("\\n", "\n")
content = content.replace("\\t", "\t")
content = content.replace("\\\\", "\\")
return content
else:
# Apply only outside code blocks (safe mode)
parts = content.split("```")
for i in range(
0, len(parts), 2
): # Even indices are markdown text (not code)
parts[i] = parts[i].replace("\\r\\n", "\n")
parts[i] = parts[i].replace("\\n", "\n")
parts[i] = parts[i].replace("\\t", "\t")
parts[i] = parts[i].replace("\\\\", "\\")
return "```".join(parts)
def clean_text(text: str) -> str:
# Only fix \n and double backslashes, skip \t as it's dangerous for LaTeX (\times, \theta)
text = text.replace("\\r\\n", "\n")
text = text.replace("\\n", "\n")
text = text.replace("\\\\", "\\")
return text
# 1. Protect code blocks
parts = content.split("```")
for i in range(0, len(parts), 2): # Even indices are text
# 2. Protect LaTeX formulas within text
# Split by $ to find inline/block math
sub_parts = parts[i].split("$")
for j in range(0, len(sub_parts), 2): # Even indices are non-math text
sub_parts[j] = clean_text(sub_parts[j])
parts[i] = "$".join(sub_parts)
return "```".join(parts)
def _fix_thought_tags(self, content: str) -> str:
"""Normalize thought tags: unify naming and fix spacing"""
@@ -390,11 +599,6 @@ class ContentNormalizer:
if "mermaid" in lang_line:
# Protect edge labels (text between link start and arrow) from being modified
# by temporarily replacing them with placeholders.
# Covers all Mermaid link types:
# - Solid line: A -- text --> B, A -- text --o B, A -- text --x B
# - Dotted line: A -. text .-> B, A -. text .-o B
# - Thick line: A == text ==> B, A == text ==o B
# - No arrow: A -- text --- B
edge_labels = []
def protect_edge_label(m):
@@ -404,7 +608,6 @@ class ContentNormalizer:
edge_labels.append((start, label, arrow))
return f"___EDGE_LABEL_{len(edge_labels)-1}___"
# Comprehensive edge label pattern for all Mermaid link types
edge_label_pattern = (
r"(--|-\.|\=\=)\s+(.+?)\s+(--+[>ox]?|--+\|>|\.-[>ox]?|=+[>ox]?)"
)
@@ -435,12 +638,6 @@ class ContentNormalizer:
def _fix_headings(self, content: str) -> str:
"""Fix missing space in headings: #Heading -> # Heading"""
# We only fix if it's not inside a code block.
# But splitting by code block is expensive.
# Given headings usually don't appear inside code blocks without space in valid code (except comments),
# we might risk false positives in comments like `#TODO`.
# To be safe, let's split by code blocks.
parts = content.split("```")
for i in range(0, len(parts), 2): # Even indices are markdown text
parts[i] = self._PATTERNS["heading_space"].sub(r"\1 \2", parts[i])
@@ -467,44 +664,34 @@ class ContentNormalizer:
inner = match.group("inner")
# Recursive step: Fix emphasis spacing INSIDE the current block first
# This ensures that ** _ italic _ ** becomes ** _italic_ ** before we strip outer spaces.
inner = self._PATTERNS["emphasis_spacing"].sub(replacer, inner)
# If no leading/trailing whitespace, nothing to fix at this level
stripped_inner = inner.strip()
if stripped_inner == inner:
return f"{symbol}{inner}{symbol}"
# Safeguard: If inner content is just whitespace, don't touch it
if not stripped_inner:
return match.group(0)
# Safeguard: If it looks like a math expression or list of variables (e.g. " * 3 * " or " _ b _ ")
# If the symbol is surrounded by spaces in the original text, it's likely an operator.
# Heuristic checks
if inner.startswith(" ") and inner.endswith(" "):
# If it's single '*' or '_', and both sides have spaces, it's almost certainly an operator.
if symbol in ["*", "_"]:
return match.group(0)
if symbol == "*":
if not any(c.isalpha() for c in inner):
return match.group(0)
# Safeguard: List marker protection
# If symbol is single '*' and inner content starts with whitespace followed by emphasis markers,
# this is likely a list item like "* **bold**" - don't merge them.
# Pattern: "* **text**" should NOT become "***text**"
if symbol == "*" and inner.lstrip().startswith(("*", "_")):
return match.group(0)
# Extended list marker protection:
# If symbol is single '*' and inner starts with multiple spaces (list indentation pattern),
# this is likely a list item like "* text" - don't strip the spaces.
# Pattern: "* U16 forward **Kuang**" should NOT become "*U16 forward **Kuang**"
if symbol == "*" and inner.startswith(" "):
return match.group(0)
if symbol in stripped_inner:
return match.group(0)
return f"{symbol}{stripped_inner}{symbol}"
parts = content.split("```")
for i in range(0, len(parts), 2): # Even indices are markdown text
# We use a while loop to handle overlapping or multiple occurrences at the top level
for i in range(0, len(parts), 2):
while True:
new_part = self._PATTERNS["emphasis_spacing"].sub(replacer, parts[i])
if new_part == parts[i]:
@@ -517,82 +704,201 @@ class Filter:
class Valves(BaseModel):
priority: int = Field(
default=50,
description="Priority level. Higher runs later (recommended to run after other filters).",
description="Priority level (lower = earlier).",
)
enable_escape_fix: bool = Field(
default=True, description="Fix excessive escape characters (\\n, \\t, etc.)"
default=True,
description="Fix excessive escape characters (\\n, \\t, etc.).",
)
enable_escape_fix_in_code_blocks: bool = Field(
default=False,
description="Apply escape fix inside code blocks (⚠️ Warning: May break valid code like JSON strings or regex patterns. Default: False for safety)",
description="Apply escape fix inside code blocks (Warning: May break valid code).",
)
enable_thought_tag_fix: bool = Field(
default=True, description="Normalize </thought> tags"
default=True,
description="Normalize thought tags (<think> -> <thought>).",
)
enable_details_tag_fix: bool = Field(
default=True,
description="Normalize <details> tags (add blank line after </details> and handle self-closing tags)",
description="Normalize <details> tags (add blank line after closing tag).",
)
enable_code_block_fix: bool = Field(
default=True,
description="Fix code block formatting (indentation, newlines)",
description="Fix code block formatting (indentation, newlines).",
)
enable_latex_fix: bool = Field(
default=True, description="Normalize LaTeX formulas (\\[ -> $$, \\( -> $)"
default=True,
description="Normalize LaTeX formulas (\\[ -> $$, \\( -> $).",
)
enable_list_fix: bool = Field(
default=False, description="Fix list item newlines (Experimental)"
default=False,
description="Fix list item newlines (Experimental).",
)
enable_unclosed_block_fix: bool = Field(
default=True, description="Auto-close unclosed code blocks"
default=True,
description="Auto-close unclosed code blocks.",
)
enable_fullwidth_symbol_fix: bool = Field(
default=False, description="Fix full-width symbols in code blocks"
default=False,
description="Fix full-width symbols in code blocks.",
)
enable_mermaid_fix: bool = Field(
default=True,
description="Fix common Mermaid syntax errors (e.g. unquoted labels)",
description="Fix common Mermaid syntax errors (e.g. unquoted labels).",
)
enable_heading_fix: bool = Field(
default=True,
description="Fix missing space in headings (#Header -> # Header)",
description="Fix missing space in headings (#Header -> # Header).",
)
enable_table_fix: bool = Field(
default=True, description="Fix missing closing pipe in tables"
default=True,
description="Fix missing closing pipe in tables.",
)
enable_xml_tag_cleanup: bool = Field(
default=True, description="Cleanup leftover XML tags"
default=True,
description="Cleanup leftover XML tags.",
)
enable_emphasis_spacing_fix: bool = Field(
default=False,
description="Fix spaces inside **emphasis** (e.g. ** text ** -> **text**)",
description="Fix spaces inside **emphasis** (e.g. ** text ** -> **text**).",
)
show_status: bool = Field(
default=True, description="Show status notification when fixes are applied"
default=True,
description="Show status notification when fixes are applied.",
)
show_debug_log: bool = Field(
default=True, description="Print debug logs to browser console (F12)"
default=True,
description="Print debug logs to browser console (F12).",
)
    def __init__(self):
        # Valve defaults; the host applies user overrides at runtime.
        self.valves = self.Valves()
        # Explicit locale aliases consulted before the generic
        # base-language prefix scan (e.g. "en-GB" -> "en-US"); keys may be
        # bare base codes ("zh") or full region variants ("es-AR").
        self.fallback_map = {
            "zh": "zh-CN",
            "en": "en-US",
            "ko": "ko-KR",
            "ja": "ja-JP",
            "fr": "fr-FR",
            "de": "de-DE",
            "es": "es-ES",
            "it": "it-IT",
            "vi": "vi-VN",
            "id": "id-ID",
            "es-AR": "es-ES",
            "es-MX": "es-ES",
            "fr-CA": "fr-FR",
            "en-CA": "en-US",
            "en-GB": "en-US",
            "en-AU": "en-US",
            "de-AT": "de-DE",
        }
def _resolve_language(self, lang: str) -> str:
"""Resolve the best matching language code from the TRANSLATIONS dict."""
target_lang = lang
# 1. Direct match
if target_lang in TRANSLATIONS:
return target_lang
# 2. Variant fallback (explicit mapping)
if target_lang in self.fallback_map:
target_lang = self.fallback_map[target_lang]
if target_lang in TRANSLATIONS:
return target_lang
# 3. Base language fallback (e.g. fr-BE -> fr-FR)
if "-" in lang:
base_lang = lang.split("-")[0]
for supported_lang in TRANSLATIONS:
if supported_lang.startswith(base_lang + "-"):
return supported_lang
# 4. Final Fallback to en-US
return "en-US"
def _get_translation(self, lang: str, key: str, **kwargs) -> str:
"""Get translated string for the given language and key."""
target_lang = self._resolve_language(lang)
# Retrieve dictionary
lang_dict = TRANSLATIONS.get(target_lang, TRANSLATIONS["en-US"])
# Get string
text = lang_dict.get(key, TRANSLATIONS["en-US"].get(key, key))
# Format if arguments provided
if kwargs:
try:
text = text.format(**kwargs)
except Exception as e:
logger.warning(f"Translation formatting failed for {key}: {e}")
return text
    async def _get_user_context(
        self,
        __user__: Optional[dict],
        __event_call__: Optional[Callable] = None,
        __request__: Optional[Request] = None,
    ) -> dict:
        """
        Robust extraction of user context with multi-level fallback for language detection.
        Priority: localStorage (via JS) > HTTP headers > User profile > en-US

        Returns a dict with keys "user_id", "user_name", "user_language".
        """
        # Defensive: __user__ may be None or a non-dict depending on the caller.
        user_data = __user__ if isinstance(__user__, dict) else {}
        user_id = user_data.get("id", "unknown_user")
        user_name = user_data.get("name", "User")
        # Baseline: language from the user profile, else en-US.
        user_language = user_data.get("language", "en-US")
        # 1. Fallback: HTTP Accept-Language header.
        # Keep only the first entry, stripped of its quality factor
        # (e.g. "fr-FR,fr;q=0.9" -> "fr-FR").
        if __request__ and hasattr(__request__, "headers"):
            accept_lang = __request__.headers.get("accept-language", "")
            if accept_lang:
                user_language = accept_lang.split(",")[0].split(";")[0]
        # 2. Priority: Frontend localStorage via JS (requires timeout protection)
        if __event_call__:
            try:
                # Evaluated in the browser; each source is tried in turn.
                js_code = """
                try {
                    return (
                        document.documentElement.lang ||
                        localStorage.getItem('locale') ||
                        navigator.language ||
                        'en-US'
                    );
                } catch (e) {
                    return 'en-US';
                }
                """
                # MUST use wait_for with timeout to prevent backend deadlock
                frontend_lang = await asyncio.wait_for(
                    __event_call__({"type": "execute", "data": {"code": js_code}}),
                    timeout=2.0,
                )
                # NOTE(review): assumes the event call resolves to the JS
                # string result directly — confirm against the host's
                # "execute" event contract.
                if frontend_lang and isinstance(frontend_lang, str):
                    user_language = frontend_lang
            except Exception:
                pass  # Fallback to existing language
        return {
            "user_id": user_id,
            "user_name": user_name,
            "user_language": user_language,
        }
def _get_chat_context(
self, body: dict, __metadata__: Optional[dict] = None
) -> Dict[str, str]:
"""
Unified extraction of chat context information (chat_id, message_id).
Prioritizes extraction from body, then metadata.
"""
"""Unified extraction of chat context information"""
chat_id = ""
message_id = ""
# 1. Try to get from body
if isinstance(body, dict):
chat_id = body.get("chat_id", "")
message_id = body.get("id", "") # message_id is usually 'id' in body
message_id = body.get("id", "")
# Check body.metadata as fallback
if not chat_id or not message_id:
body_metadata = body.get("metadata", {})
if isinstance(body_metadata, dict):
@@ -601,7 +907,6 @@ class Filter:
if not message_id:
message_id = body_metadata.get("message_id", "")
# 2. Try to get from __metadata__ (as supplement)
if __metadata__ and isinstance(__metadata__, dict):
if not chat_id:
chat_id = __metadata__.get("chat_id", "")
@@ -614,19 +919,42 @@ class Filter:
}
def _contains_html(self, content: str) -> bool:
"""Check if content contains HTML tags (to avoid breaking HTML output)"""
# Removed common Mermaid-compatible tags like br, b, i, strong, em, span
"""Check if content contains HTML tags"""
pattern = r"<\s*/?\s*(?:html|head|body|div|p|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th|img|a|code|pre|blockquote|h[1-6]|script|style|form|input|button|label|select|option|iframe|link|meta|title)\b"
return bool(re.search(pattern, content, re.IGNORECASE))
async def _emit_status(self, __event_emitter__, applied_fixes: List[str]):
"""Emit status notification"""
async def _emit_status(
self, __event_emitter__, applied_fixes: List[str], lang: str
):
"""Emit status notification with i18n support"""
if not self.valves.show_status or not applied_fixes:
return
description = "✓ Markdown Normalized"
if applied_fixes:
description += f": {', '.join(applied_fixes)}"
# Map internal fix IDs to i18n keys
fix_key_map = {
"Fix Escape Chars": "fix_escape",
"Normalize Thought Tags": "fix_thought",
"Normalize Details Tags": "fix_details",
"Fix Code Blocks": "fix_code",
"Normalize LaTeX": "fix_latex",
"Fix List Format": "fix_list",
"Close Code Blocks": "fix_close_code",
"Fix Full-width Symbols": "fix_fullwidth",
"Fix Mermaid Syntax": "fix_mermaid",
"Fix Headings": "fix_heading",
"Fix Tables": "fix_table",
"Cleanup XML Tags": "fix_xml",
"Fix Emphasis Spacing": "fix_emphasis",
"Custom Cleaner": "fix_custom",
}
prefix = self._get_translation(lang, "status_prefix")
translated_fixes = [
self._get_translation(lang, fix_key_map.get(fix, fix))
for fix in applied_fixes
]
description = f"{prefix}: {', '.join(translated_fixes)}"
try:
await __event_emitter__(
@@ -639,7 +967,7 @@ class Filter:
}
)
except Exception as e:
print(f"Error emitting status: {e}")
logger.error(f"Error emitting status: {e}")
async def _emit_debug_log(
self,
@@ -654,7 +982,6 @@ class Filter:
return
try:
# Construct JS code
js_code = f"""
(async function() {{
console.group("🛠️ Markdown Normalizer Debug");
@@ -665,7 +992,6 @@ class Filter:
console.groupEnd();
}})();
"""
await __event_call__(
{
"type": "execute",
@@ -673,7 +999,8 @@ class Filter:
}
)
except Exception as e:
print(f"Error emitting debug log: {e}")
# We don't want to fail the whole normalization if debug logging fails
pass
async def outlet(
self,
@@ -682,21 +1009,17 @@ class Filter:
__event_emitter__=None,
__event_call__=None,
__metadata__: Optional[dict] = None,
__request__: Optional[Request] = None,
) -> dict:
"""
Process the response body to normalize Markdown content.
"""
"""Process response body"""
if "messages" in body and body["messages"]:
last = body["messages"][-1]
content = last.get("content", "") or ""
if last.get("role") == "assistant" and isinstance(content, str):
# Skip if content looks like HTML to avoid breaking it
if self._contains_html(content):
return body
# Skip if content contains tool output markers (native function calling)
# Pattern: ""&quot;...&quot;"" or tool_call_id or <details type="tool_calls"...>
if (
'""&quot;' in content
or "tool_call_id" in content
@@ -704,7 +1027,6 @@ class Filter:
):
return body
# Configure normalizer based on valves
config = NormalizerConfig(
enable_escape_fix=self.valves.enable_escape_fix,
enable_escape_fix_in_code_blocks=self.valves.enable_escape_fix_in_code_blocks,
@@ -723,18 +1045,19 @@ class Filter:
)
normalizer = ContentNormalizer(config)
# Execute normalization
new_content = normalizer.normalize(content)
# Update content if changed
if new_content != content:
last["content"] = new_content
# Emit status if enabled
if __event_emitter__:
user_ctx = await self._get_user_context(
__user__, __event_call__, __request__
)
await self._emit_status(
__event_emitter__, normalizer.applied_fixes
__event_emitter__,
normalizer.applied_fixes,
user_ctx["user_language"],
)
chat_ctx = self._get_chat_context(body, __metadata__)
await self._emit_debug_log(

View File

@@ -1,761 +0,0 @@
"""
title: Markdown 格式修复器 (Markdown Normalizer)
author: Fu-Jie
author_url: https://github.com/Fu-Jie/openwebui-extensions
funding_url: https://github.com/open-webui
version: 1.2.4
description: 内容规范化过滤器,修复 LLM 输出中常见的 Markdown 格式问题如损坏的代码块、LaTeX 公式、Mermaid 图表和列表格式。
"""
from pydantic import BaseModel, Field
from typing import Optional, List, Callable, Dict
import re
import logging
import asyncio
import json
from dataclasses import dataclass, field
# Configure logging
logger = logging.getLogger(__name__)
@dataclass
class NormalizerConfig:
    """Configuration class for enabling/disabling specific normalization rules."""
    enable_escape_fix: bool = True  # Fix excessive escape characters
    enable_escape_fix_in_code_blocks: bool = (
        False  # Apply escape fix inside code blocks (default: off, for safety)
    )
    enable_thought_tag_fix: bool = True  # Normalize chain-of-thought tags
    enable_details_tag_fix: bool = True  # Normalize <details> tags (similar to thought tags)
    enable_code_block_fix: bool = True  # Fix code block formatting
    enable_latex_fix: bool = True  # Fix LaTeX formula formatting
    enable_list_fix: bool = False  # Fix list item newlines (default off: may be too aggressive)
    enable_unclosed_block_fix: bool = True  # Auto-close unclosed code blocks
    enable_fullwidth_symbol_fix: bool = False  # Fix full-width symbols in code blocks
    enable_mermaid_fix: bool = True  # Fix common Mermaid syntax errors
    enable_heading_fix: bool = True  # Fix missing space in headings (#Header -> # Header)
    enable_table_fix: bool = True  # Fix missing closing pipe in tables
    enable_xml_tag_cleanup: bool = True  # Clean up leftover XML tags
    enable_emphasis_spacing_fix: bool = False  # Fix extra spaces inside **emphasis**
    # Custom cleaner functions (for advanced extension)
    custom_cleaners: List[Callable[[str], str]] = field(default_factory=list)
class ContentNormalizer:
    """LLM Output Content Normalizer - Production Grade Implementation.

    Runs a configurable pipeline of regex-based fixes over LLM output:
    broken code fences, LaTeX delimiters, Mermaid labels, headings,
    tables, leftover XML tags, emphasis spacing, etc.  Fixes that were
    actually applied are recorded in ``self.applied_fixes``.
    """
    # --- 1. Pre-compiled Regex Patterns (Performance Optimization) ---
    _PATTERNS = {
        # Code block prefix: if ``` is not at start of line (ignoring whitespace)
        "code_block_prefix": re.compile(r"(\S[ \t]*)(```)"),
        # Code block suffix: ```lang followed by non-whitespace (no newline)
        "code_block_suffix": re.compile(r"(```[\w\+\-\.]*)[ \t]+([^\n\r])"),
        # Code block indent: whitespace at start of line + ```
        "code_block_indent": re.compile(r"^[ \t]+(```)", re.MULTILINE),
        # Thought tag: </thought> followed by optional whitespace/newlines
        "thought_end": re.compile(
            r"</(thought|think|thinking)>[ \t]*\n*", re.IGNORECASE
        ),
        "thought_start": re.compile(r"<(thought|think|thinking)>", re.IGNORECASE),
        # Details tag: </details> followed by optional whitespace/newlines
        "details_end": re.compile(r"</details>[ \t]*\n*", re.IGNORECASE),
        # Self-closing details tag: <details ... /> followed by optional whitespace (but NOT already having newline)
        "details_self_closing": re.compile(
            r"(<details[^>]*/\s*>)(?!\n)", re.IGNORECASE
        ),
        # LaTeX block: \[ ... \]
        "latex_bracket_block": re.compile(r"\\\[(.+?)\\\]", re.DOTALL),
        # LaTeX inline: \( ... \)
        "latex_paren_inline": re.compile(r"\\\((.+?)\\\)"),
        # List item: non-newline + digit + dot + space
        "list_item": re.compile(r"([^\n])(\d+\. )"),
        # XML artifacts (e.g. Claude's)
        "xml_artifacts": re.compile(
            r"</?(?:antArtifact|antThinking|artifact)[^>]*>", re.IGNORECASE
        ),
        # Mermaid: match nodes of every supported shape and add quotes to
        # unquoted labels.  Each shape's delimiter pair must be matched
        # exactly so the shape structure is preserved; longer delimiters
        # take priority (listed first).
        "mermaid_node": re.compile(
            r'("[^"\\]*(?:\\.[^"\\]*)*")|'  # Match quoted strings first (Group 1)
            r"(\w+)(?:"
            r"(\(\(\()(?![\"])(.*?)(?<![\"])(\)\)\))|"  # (((...))) Double Circle
            r"(\(\()(?![\"])(.*?)(?<![\"])(\)\))|"  # ((...)) Circle
            r"(\(\[)(?![\"])(.*?)(?<![\"])(\]\))|"  # ([...]) Stadium
            r"(\[\()(?![\"])(.*?)(?<![\"])(\)\])|"  # [(...)] Cylinder
            r"(\[\[)(?![\"])(.*?)(?<![\"])(\]\])|"  # [[...]] Subroutine
            r"(\{\{)(?![\"])(.*?)(?<![\"])(\}\})|"  # {{...}} Hexagon
            r"(\[/)(?![\"])(.*?)(?<![\"])(/\])|"  # [/.../] Parallelogram
            r"(\[\\)(?![\"])(.*?)(?<![\"])(\\\])|"  # [\...\] Parallelogram Alt
            r"(\[/)(?![\"])(.*?)(?<![\"])(\\\])|"  # [/...\] Trapezoid
            r"(\[\\)(?![\"])(.*?)(?<![\"])(/\])|"  # [\.../] Trapezoid Alt
            r"(\()(?![\"])([^)]*?)(?<![\"])(\))|"  # (...) Round - Modified to be safer
            r"(\[)(?![\"])(.*?)(?<![\"])(\])|"  # [...] Square
            r"(\{)(?![\"])(.*?)(?<![\"])(\})|"  # {...} Rhombus
            r"(>)(?![\"])(.*?)(?<![\"])(\])"  # >...] Asymmetric
            r")"
            r"(\s*\[\d+\])?",  # Capture optional citation [1]
            re.DOTALL,
        ),
        # Heading: #Heading -> # Heading
        "heading_space": re.compile(r"^(#+)([^ \n#])", re.MULTILINE),
        # Table: | col1 | col2 -> | col1 | col2 |
        "table_pipe": re.compile(r"^(\|.*[^|\n])$", re.MULTILINE),
        # Emphasis spacing: ** text ** -> **text**, __ text __ -> __text__
        # Matches emphasis blocks within a single line. We use a recursive approach
        # in _fix_emphasis_spacing to handle nesting and spaces correctly.
        # NOTE: We use [^\n] instead of . to prevent cross-line matching.
        # Supports: * (italic), ** (bold), *** (bold+italic), _ (italic), __ (bold), ___ (bold+italic)
        # NOTE(review): with two emphasis runs on one line ("**a** x **b**")
        # this pattern can pair the first run's closing markers with the
        # second run's opening markers and treat the text between them as
        # "inner" — presumably the consecutive-emphasis corruption seen in
        # the wild; verify with "I **prefer** tea **to** coffee.".
        "emphasis_spacing": re.compile(
            r"(?<!\*|_)(\*{1,3}|_{1,3})(?P<inner>[^\n]*?)(\1)(?!\*|_)"
        ),
    }
    def __init__(self, config: Optional[NormalizerConfig] = None):
        """Create a normalizer; uses a default NormalizerConfig when none is given."""
        self.config = config or NormalizerConfig()
        # Human-readable names of fixes applied by the most recent normalize() call.
        self.applied_fixes: List[str] = []
    def normalize(self, content: str) -> str:
        """Main entry point: apply all normalization rules in order.

        Order matters — e.g. escape fixing must run first, and details-tag
        normalization must run before heading fixing.  Returns the original
        content unchanged if any rule raises.
        """
        self.applied_fixes = []
        if not content:
            return content
        original_content = content  # Keep a copy for logging
        try:
            # 1. Escape character fix (Must be first)
            if self.config.enable_escape_fix:
                original = content
                content = self._fix_escape_characters(content)
                if content != original:
                    self.applied_fixes.append("Fix Escape Chars")
            # 2. Thought tag normalization
            if self.config.enable_thought_tag_fix:
                original = content
                content = self._fix_thought_tags(content)
                if content != original:
                    self.applied_fixes.append("Normalize Thought Tags")
            # 3. Details tag normalization (must be before heading fix)
            if self.config.enable_details_tag_fix:
                original = content
                content = self._fix_details_tags(content)
                if content != original:
                    self.applied_fixes.append("Normalize Details Tags")
            # 4. Code block formatting fix
            if self.config.enable_code_block_fix:
                original = content
                content = self._fix_code_blocks(content)
                if content != original:
                    self.applied_fixes.append("Fix Code Blocks")
            # 5. LaTeX formula normalization
            if self.config.enable_latex_fix:
                original = content
                content = self._fix_latex_formulas(content)
                if content != original:
                    self.applied_fixes.append("Normalize LaTeX")
            # 6. List formatting fix
            if self.config.enable_list_fix:
                original = content
                content = self._fix_list_formatting(content)
                if content != original:
                    self.applied_fixes.append("Fix List Format")
            # 7. Unclosed code block fix
            if self.config.enable_unclosed_block_fix:
                original = content
                content = self._fix_unclosed_code_blocks(content)
                if content != original:
                    self.applied_fixes.append("Close Code Blocks")
            # 8. Full-width symbol fix (in code blocks only)
            if self.config.enable_fullwidth_symbol_fix:
                original = content
                content = self._fix_fullwidth_symbols_in_code(content)
                if content != original:
                    self.applied_fixes.append("Fix Full-width Symbols")
            # 9. Mermaid syntax fix
            if self.config.enable_mermaid_fix:
                original = content
                content = self._fix_mermaid_syntax(content)
                if content != original:
                    self.applied_fixes.append("Fix Mermaid Syntax")
            # 10. Heading fix
            if self.config.enable_heading_fix:
                original = content
                content = self._fix_headings(content)
                if content != original:
                    self.applied_fixes.append("Fix Headings")
            # 11. Table fix
            if self.config.enable_table_fix:
                original = content
                content = self._fix_tables(content)
                if content != original:
                    self.applied_fixes.append("Fix Tables")
            # 12. XML tag cleanup
            if self.config.enable_xml_tag_cleanup:
                original = content
                content = self._cleanup_xml_tags(content)
                if content != original:
                    self.applied_fixes.append("Cleanup XML Tags")
            # 13. Emphasis spacing fix
            if self.config.enable_emphasis_spacing_fix:
                original = content
                content = self._fix_emphasis_spacing(content)
                if content != original:
                    self.applied_fixes.append("Fix Emphasis Spacing")
            # 14. Custom cleaners
            for cleaner in self.config.custom_cleaners:
                original = content
                content = cleaner(content)
                if content != original:
                    self.applied_fixes.append("Custom Cleaner")
            if self.applied_fixes:
                print(f"[Markdown Normalizer] Applied fixes: {self.applied_fixes}")
                print(
                    f"[Markdown Normalizer] --- Original Content ---\n{original_content}\n------------------------"
                )
                print(
                    f"[Markdown Normalizer] --- Normalized Content ---\n{content}\n--------------------------"
                )
            return content
        except Exception as e:
            # Production safeguard: return original content on error
            logger.error(f"Content normalization failed: {e}", exc_info=True)
            return content
    def _fix_escape_characters(self, content: str) -> str:
        """Fix over-escaped characters.

        When enable_escape_fix_in_code_blocks is False (the default), escapes
        are only repaired OUTSIDE fenced code blocks, so valid code samples
        (e.g. JSON strings with \\n, regex patterns) are left intact.
        """
        if self.config.enable_escape_fix_in_code_blocks:
            # Apply globally (original behavior)
            content = content.replace("\\r\\n", "\n")
            content = content.replace("\\n", "\n")
            content = content.replace("\\t", "\t")
            content = content.replace("\\\\", "\\")
            return content
        else:
            # Apply outside code blocks only (safe mode)
            parts = content.split("```")
            for i in range(0, len(parts), 2):  # even indices are Markdown text (non-code)
                parts[i] = parts[i].replace("\\r\\n", "\n")
                parts[i] = parts[i].replace("\\n", "\n")
                parts[i] = parts[i].replace("\\t", "\t")
                parts[i] = parts[i].replace("\\\\", "\\")
            return "```".join(parts)
    def _fix_thought_tags(self, content: str) -> str:
        """Normalize thought tags: unify naming and fix spacing"""
        # 1. Standardize start tag: <think>, <thinking> -> <thought>
        content = self._PATTERNS["thought_start"].sub("<thought>", content)
        # 2. Standardize end tag and ensure newlines: </think> -> </thought>\n\n
        return self._PATTERNS["thought_end"].sub("</thought>\n\n", content)
    def _fix_details_tags(self, content: str) -> str:
        """Normalize <details> tags: ensure correct spacing after closing tags.

        Handles two cases:
        1. </details> followed by content -> ensure a blank line after it
        2. self-closing <details .../> followed by content -> ensure a newline

        Applied outside fenced code blocks only, to avoid breaking code samples.
        """
        parts = content.split("```")
        for i in range(0, len(parts), 2):  # even indices are Markdown text
            # 1. Ensure a blank line after </details>
            parts[i] = self._PATTERNS["details_end"].sub("</details>\n\n", parts[i])
            # 2. Ensure a newline after self-closing <details ... />
            parts[i] = self._PATTERNS["details_self_closing"].sub(r"\1\n", parts[i])
        return "```".join(parts)
    def _fix_code_blocks(self, content: str) -> str:
        """Fix code block formatting (prefixes, suffixes, indentation)"""
        # Ensure newline before ```
        content = self._PATTERNS["code_block_prefix"].sub(r"\n\1", content)
        # Ensure newline after ```lang
        content = self._PATTERNS["code_block_suffix"].sub(r"\1\n\2", content)
        return content
    def _fix_latex_formulas(self, content: str) -> str:
        r"""Normalize LaTeX formulas: \[ -> $$ (block), \( -> $ (inline)"""
        content = self._PATTERNS["latex_bracket_block"].sub(r"$$\1$$", content)
        content = self._PATTERNS["latex_paren_inline"].sub(r"$\1$", content)
        return content
    def _fix_list_formatting(self, content: str) -> str:
        """Fix missing newlines in lists (e.g., 'text1. item' -> 'text\\n1. item')"""
        return self._PATTERNS["list_item"].sub(r"\1\n\2", content)
    def _fix_unclosed_code_blocks(self, content: str) -> str:
        """Auto-close unclosed code blocks"""
        # An odd number of ``` fences means the last block was never closed.
        if content.count("```") % 2 != 0:
            content += "\n```"
        return content
    def _fix_fullwidth_symbols_in_code(self, content: str) -> str:
        """Convert full-width symbols to half-width inside code blocks"""
        # NOTE(review): the full-width key characters appear to have been lost
        # in transit — every key below renders as the empty string, making the
        # dict literal a no-op (duplicate "" keys, last one wins).  The keys
        # are presumably the full-width forms of , . ( ) [ ] ; : ? ! " '
        # — verify against the original source before relying on this mapping.
        FULLWIDTH_MAP = {
            "": ",",
            "": ".",
            "": "(",
            "": ")",
            "": "[",
            "": "]",
            "": ";",
            "": ":",
            "": "?",
            "": "!",
            "": '"',
            "": '"',
            "": "'",
            "": "'",
        }
        parts = content.split("```")
        # Code block content is at odd indices: 1, 3, 5...
        for i in range(1, len(parts), 2):
            for full, half in FULLWIDTH_MAP.items():
                parts[i] = parts[i].replace(full, half)
        return "```".join(parts)
    def _fix_mermaid_syntax(self, content: str) -> str:
        """Fix common Mermaid syntax errors while preserving node shapes."""
        def replacer(match):
            # Group 1 is Quoted String (if matched)
            if match.group(1):
                return match.group(1)
            # Group 2 is ID
            id_str = match.group(2)
            # Find matching shape group
            groups = match.groups()
            citation = groups[-1] or ""  # Last group is citation
            # Iterate over shape groups (excluding the last citation group)
            for i in range(2, len(groups) - 1, 3):
                if groups[i] is not None:
                    open_char = groups[i]
                    # NOTE: this local deliberately shadows the method's
                    # `content` argument; it is only the node label here.
                    content = groups[i + 1]
                    close_char = groups[i + 2]
                    # Append citation to content if present
                    if citation:
                        content += citation
                    # Escape double quotes inside the label before re-quoting
                    content = content.replace('"', '\\"')
                    return f'{id_str}{open_char}"{content}"{close_char}'
            return match.group(0)
        parts = content.split("```")
        for i in range(1, len(parts), 2):
            # Check if it's a mermaid block
            lang_line = parts[i].split("\n", 1)[0].strip().lower()
            if "mermaid" in lang_line:
                # Protect edge labels (text between link start and arrow) from being modified
                # by temporarily replacing them with placeholders.
                # Covers all Mermaid link types:
                # - Solid line: A -- text --> B, A -- text --o B, A -- text --x B
                # - Dotted line: A -. text .-> B, A -. text .-o B
                # - Thick line: A == text ==> B, A == text ==o B
                # - No arrow: A -- text --- B
                edge_labels = []
                def protect_edge_label(m):
                    start = m.group(1)  # Link start: --, -., or ==
                    label = m.group(2)  # Text content
                    arrow = m.group(3)  # Arrow/end pattern
                    edge_labels.append((start, label, arrow))
                    return f"___EDGE_LABEL_{len(edge_labels)-1}___"
                # Comprehensive edge label pattern for all Mermaid link types
                edge_label_pattern = (
                    r"(--|-\.|\=\=)\s+(.+?)\s+(--+[>ox]?|--+\|>|\.-[>ox]?|=+[>ox]?)"
                )
                protected = re.sub(edge_label_pattern, protect_edge_label, parts[i])
                # Apply the comprehensive regex fix to protected content
                fixed = self._PATTERNS["mermaid_node"].sub(replacer, protected)
                # Restore edge labels
                for idx, (start, label, arrow) in enumerate(edge_labels):
                    fixed = fixed.replace(
                        f"___EDGE_LABEL_{idx}___", f"{start} {label} {arrow}"
                    )
                parts[i] = fixed
                # Auto-close subgraphs
                # Count 'subgraph' and 'end' (case-insensitive)
                # We use a simple regex to avoid matching words inside labels (though labels are now quoted, so it's safer)
                # But for simplicity and speed, we just count occurrences in the whole block.
                # A more robust way would be to strip quoted strings first, but that's expensive.
                # Given we just quoted everything, let's try to count keywords outside quotes?
                # Actually, since we just normalized nodes, most text is in quotes.
                # Let's just do a simple count. It's a heuristic fix.
                subgraph_count = len(
                    re.findall(r"\bsubgraph\b", parts[i], re.IGNORECASE)
                )
                end_count = len(re.findall(r"\bend\b", parts[i], re.IGNORECASE))
                if subgraph_count > end_count:
                    missing_ends = subgraph_count - end_count
                    parts[i] = parts[i].rstrip() + ("\n end" * missing_ends) + "\n"
        return "```".join(parts)
    def _fix_headings(self, content: str) -> str:
        """Fix missing space in headings: #Heading -> # Heading"""
        # We only fix if it's not inside a code block.
        # But splitting by code block is expensive.
        # Given headings usually don't appear inside code blocks without space in valid code (except comments),
        # we might risk false positives in comments like `#TODO`.
        # To be safe, let's split by code blocks.
        parts = content.split("```")
        for i in range(0, len(parts), 2):  # Even indices are markdown text
            parts[i] = self._PATTERNS["heading_space"].sub(r"\1 \2", parts[i])
        return "```".join(parts)
    def _fix_tables(self, content: str) -> str:
        """Fix tables missing closing pipe"""
        parts = content.split("```")
        for i in range(0, len(parts), 2):
            parts[i] = self._PATTERNS["table_pipe"].sub(r"\1|", parts[i])
        return "```".join(parts)
    def _cleanup_xml_tags(self, content: str) -> str:
        """Remove leftover XML tags"""
        return self._PATTERNS["xml_artifacts"].sub("", content)
    def _fix_emphasis_spacing(self, content: str) -> str:
        """Fix spaces inside **emphasis** or _emphasis_
        Example: ** text ** -> **text**, **text ** -> **text**, ** text** -> **text**
        """
        def replacer(match):
            symbol = match.group(1)
            inner = match.group("inner")
            # Recursive step: Fix emphasis spacing INSIDE the current block first
            # This ensures that ** _ italic _ ** becomes ** _italic_ ** before we strip outer spaces.
            inner = self._PATTERNS["emphasis_spacing"].sub(replacer, inner)
            # If no leading/trailing whitespace, nothing to fix at this level
            stripped_inner = inner.strip()
            if stripped_inner == inner:
                return f"{symbol}{inner}{symbol}"
            # Safeguard: If inner content is just whitespace, don't touch it
            if not stripped_inner:
                return match.group(0)
            # Safeguard: If it looks like a math expression or list of variables (e.g. " * 3 * " or " _ b _ ")
            # If the symbol is surrounded by spaces in the original text, it's likely an operator.
            if inner.startswith(" ") and inner.endswith(" "):
                # If it's single '*' or '_', and both sides have spaces, it's almost certainly an operator.
                if symbol in ["*", "_"]:
                    return match.group(0)
            # Safeguard: List marker protection
            # If symbol is single '*' and inner content starts with whitespace followed by emphasis markers,
            # this is likely a list item like "* **bold**" - don't merge them.
            # Pattern: "* **text**" should NOT become "***text**"
            if symbol == "*" and inner.lstrip().startswith(("*", "_")):
                return match.group(0)
            # Extended list marker protection:
            # If symbol is single '*' and inner starts with multiple spaces (list indentation pattern),
            # this is likely a list item like "*  text" - don't strip the spaces.
            # Pattern: "*  U16 forward **Kuang**" should NOT become "*U16 forward **Kuang**"
            if symbol == "*" and inner.startswith("  "):
                return match.group(0)
            return f"{symbol}{stripped_inner}{symbol}"
        parts = content.split("```")
        for i in range(0, len(parts), 2):  # Even indices are markdown text
            # We use a while loop to handle overlapping or multiple occurrences at the top level
            # NOTE(review): repeated passes over the same text can let the
            # emphasis pattern pair a closing "**" with the NEXT block's
            # opening "**" (see the note on the pattern above), corrupting
            # text between consecutive emphasis blocks — verify before
            # enabling this rule by default.
            while True:
                new_part = self._PATTERNS["emphasis_spacing"].sub(replacer, parts[i])
                if new_part == parts[i]:
                    break
                parts[i] = new_part
        return "```".join(parts)
class Filter:
    """Open WebUI outlet filter that normalizes Markdown in assistant replies."""
    class Valves(BaseModel):
        # User-tunable switches; each maps 1:1 onto a NormalizerConfig flag.
        # Field description strings are shown verbatim in the admin UI.
        priority: int = Field(
            default=50,
            description="优先级。数值越高运行越晚 (建议在其他过滤器之后运行)。",
        )
        enable_escape_fix: bool = Field(
            default=True, description="修复过度的转义字符 (\\n, \\t 等)"
        )
        enable_escape_fix_in_code_blocks: bool = Field(
            default=False,
            description="在代码块内部应用转义修复 (⚠️ 警告:可能会破坏有效的代码,如 JSON 字符串或正则模式。默认:关闭,以确保安全)",
        )
        enable_thought_tag_fix: bool = Field(
            default=True, description="规范化思维链标签 (<think> -> <thought>)"
        )
        enable_details_tag_fix: bool = Field(
            default=True,
            description="规范化 <details> 标签 (在 </details> 后添加空行,处理自闭合标签)",
        )
        enable_code_block_fix: bool = Field(
            default=True,
            description="修复代码块格式 (缩进、换行)",
        )
        enable_latex_fix: bool = Field(
            default=True, description="规范化 LaTeX 公式 (\\[ -> $$, \\( -> $)"
        )
        enable_list_fix: bool = Field(
            default=False, description="修复列表项换行 (实验性)"
        )
        enable_unclosed_block_fix: bool = Field(
            default=True, description="自动闭合未闭合的代码块"
        )
        enable_fullwidth_symbol_fix: bool = Field(
            default=False, description="修复代码块中的全角符号"
        )
        enable_mermaid_fix: bool = Field(
            default=True,
            description="修复常见的 Mermaid 语法错误 (如未加引号的标签)",
        )
        enable_heading_fix: bool = Field(
            default=True,
            description="修复标题中缺失的空格 (#Header -> # Header)",
        )
        enable_table_fix: bool = Field(
            default=True, description="修复表格中缺失的闭合管道符"
        )
        enable_xml_tag_cleanup: bool = Field(
            default=True, description="清理残留的 XML 标签"
        )
        enable_emphasis_spacing_fix: bool = Field(
            default=False,
            description="修复强调语法中的多余空格 (例如 ** 文本 ** -> **文本**)",
        )
        show_status: bool = Field(default=True, description="应用修复时显示状态通知")
        show_debug_log: bool = Field(
            default=True, description="在浏览器控制台打印调试日志 (F12)"
        )
    def __init__(self):
        # Valves are populated/overridden by the Open WebUI host at runtime.
        self.valves = self.Valves()
    def _get_chat_context(
        self, body: dict, __metadata__: Optional[dict] = None
    ) -> Dict[str, str]:
        """Extract chat context (chat_id, message_id) in a unified way.

        Prefers values found directly in the body, then body["metadata"],
        then the __metadata__ argument.  Missing values come back as "".
        """
        chat_id = ""
        message_id = ""
        # 1. Try the body first
        if isinstance(body, dict):
            chat_id = body.get("chat_id", "")
            message_id = body.get("id", "")  # the message id is usually under "id" in the body
            # Also check body.metadata for anything still missing
            if not chat_id or not message_id:
                body_metadata = body.get("metadata", {})
                if isinstance(body_metadata, dict):
                    if not chat_id:
                        chat_id = body_metadata.get("chat_id", "")
                    if not message_id:
                        message_id = body_metadata.get("message_id", "")
        # 2. Fall back to __metadata__ as a supplement
        if __metadata__ and isinstance(__metadata__, dict):
            if not chat_id:
                chat_id = __metadata__.get("chat_id", "")
            if not message_id:
                message_id = __metadata__.get("message_id", "")
        return {
            "chat_id": str(chat_id).strip(),
            "message_id": str(message_id).strip(),
        }
    def _contains_html(self, content: str) -> bool:
        """Check if content contains HTML tags (to avoid breaking HTML output)"""
        # Removed common Mermaid-compatible tags like br, b, i, strong, em, span
        pattern = r"<\s*/?\s*(?:html|head|body|div|p|hr|ul|ol|li|table|thead|tbody|tfoot|tr|td|th|img|a|code|pre|blockquote|h[1-6]|script|style|form|input|button|label|select|option|iframe|link|meta|title)\b"
        return bool(re.search(pattern, content, re.IGNORECASE))
    async def _emit_status(self, __event_emitter__, applied_fixes: List[str]):
        """Emit a status notification listing the fixes that were applied.

        No-op when status display is disabled or nothing was fixed.
        """
        if not self.valves.show_status or not applied_fixes:
            return
        description = "✓ Markdown 已修复"
        if applied_fixes:
            # Translate fix names for status display
            fix_map = {
                "Fix Escape Chars": "转义字符",
                "Normalize Thought Tags": "思维标签",
                "Normalize Details Tags": "Details标签",
                "Fix Code Blocks": "代码块",
                "Normalize LaTeX": "LaTeX公式",
                "Fix List Format": "列表格式",
                "Close Code Blocks": "闭合代码块",
                "Fix Full-width Symbols": "全角符号",
                "Fix Mermaid Syntax": "Mermaid语法",
                "Fix Headings": "标题格式",
                "Fix Tables": "表格格式",
                "Cleanup XML Tags": "XML清理",
                "Fix Emphasis Spacing": "强调空格",
                "Custom Cleaner": "自定义清理",
            }
            translated_fixes = [fix_map.get(fix, fix) for fix in applied_fixes]
            description += f": {', '.join(translated_fixes)}"
        try:
            await __event_emitter__(
                {
                    "type": "status",
                    "data": {
                        "description": description,
                        "done": True,
                    },
                }
            )
        except Exception as e:
            # Best-effort: a failed notification must not break the filter.
            print(f"Error emitting status: {e}")
    async def _emit_debug_log(
        self,
        __event_call__,
        applied_fixes: List[str],
        original: str,
        normalized: str,
        chat_id: str = "",
    ):
        """Emit debug log to browser console via JS execution"""
        if not self.valves.show_debug_log or not __event_call__:
            return
        try:
            # Construct JS code; json.dumps safely embeds the Python strings
            # as JS literals.
            js_code = f"""
(async function() {{
    console.group("🛠️ Markdown Normalizer Debug");
    console.log("Chat ID:", {json.dumps(chat_id)});
    console.log("Applied Fixes:", {json.dumps(applied_fixes, ensure_ascii=False)});
    console.log("Original Content:", {json.dumps(original, ensure_ascii=False)});
    console.log("Normalized Content:", {json.dumps(normalized, ensure_ascii=False)});
    console.groupEnd();
}})();
"""
            await __event_call__(
                {
                    "type": "execute",
                    "data": {"code": js_code},
                }
            )
        except Exception as e:
            # Best-effort: debug logging must never break the response path.
            print(f"Error emitting debug log: {e}")
    async def outlet(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __event_emitter__=None,
        __event_call__=None,
        __metadata__: Optional[dict] = None,
    ) -> dict:
        """
        Process the response body to normalize Markdown content.

        Only touches the last message when it is a string-content assistant
        message; HTML-looking content and native tool-call output are left
        untouched.  Returns the (possibly modified) body.
        """
        if "messages" in body and body["messages"]:
            last = body["messages"][-1]
            content = last.get("content", "") or ""
            if last.get("role") == "assistant" and isinstance(content, str):
                # Skip if the content looks like HTML, to avoid breaking it
                if self._contains_html(content):
                    return body
                # Skip if the content contains tool-output markers (native function calling)
                # Patterns: ""&quot;...&quot;"" or tool_call_id or <details type="tool_calls"...>
                if (
                    '""&quot;' in content
                    or "tool_call_id" in content
                    or '<details type="tool_calls"' in content
                ):
                    return body
                # Configure the normalizer from the valves
                config = NormalizerConfig(
                    enable_escape_fix=self.valves.enable_escape_fix,
                    enable_escape_fix_in_code_blocks=self.valves.enable_escape_fix_in_code_blocks,
                    enable_thought_tag_fix=self.valves.enable_thought_tag_fix,
                    enable_details_tag_fix=self.valves.enable_details_tag_fix,
                    enable_code_block_fix=self.valves.enable_code_block_fix,
                    enable_latex_fix=self.valves.enable_latex_fix,
                    enable_list_fix=self.valves.enable_list_fix,
                    enable_unclosed_block_fix=self.valves.enable_unclosed_block_fix,
                    enable_fullwidth_symbol_fix=self.valves.enable_fullwidth_symbol_fix,
                    enable_mermaid_fix=self.valves.enable_mermaid_fix,
                    enable_heading_fix=self.valves.enable_heading_fix,
                    enable_table_fix=self.valves.enable_table_fix,
                    enable_xml_tag_cleanup=self.valves.enable_xml_tag_cleanup,
                    enable_emphasis_spacing_fix=self.valves.enable_emphasis_spacing_fix,
                )
                normalizer = ContentNormalizer(config)
                # Execute normalization
                new_content = normalizer.normalize(content)
                # Update content if changed
                if new_content != content:
                    last["content"] = new_content
                    # Emit status if enabled
                    if __event_emitter__:
                        await self._emit_status(
                            __event_emitter__, normalizer.applied_fixes
                        )
                    chat_ctx = self._get_chat_context(body, __metadata__)
                    await self._emit_debug_log(
                        __event_call__,
                        normalizer.applied_fixes,
                        content,
                        new_content,
                        chat_id=chat_ctx["chat_id"],
                    )
        return body