feat(filters): upgrade markdown-normalizer to v1.2.7

- Fix Issue #49: resolve greedy regex matching in consecutive emphasis
- Add LaTeX formula protection to prevent corruption of \times, \nu, etc.
- Expand i18n support to 12 languages with strict alignment
- Fix NameError in Request import during testing
This commit is contained in:
fujie
2026-02-24 15:05:25 +08:00
parent 18ada2a177
commit 2da934dd92
16 changed files with 981 additions and 1009 deletions

View File

@@ -0,0 +1,31 @@
import re


def reproduce_bug():
    """Demonstrate the broken normalization logic described in Issue #49.

    The core problem: the regex is too eager across blocks, so repeated
    passes can treat the region between two bold spans ("** tea **") as a
    single bold span with padding spaces and strip it.
    """
    text = "I **prefer** tea **to** coffee."
    # A naive pattern that pairs any ** with the next **, with no guard
    # against crossing from one bold block into the next.
    buggy_pattern = re.compile(r"(\*\*)(.*?)(\*\*)")

    def buggy_fix(content):
        # Mimics the plugin's strip step: remove padding inside the markers.
        # When the regex pairs "prefer**" with the following "**", group(2)
        # ends up holding "prefer** tea " instead of a single span's text.
        return buggy_pattern.sub(
            lambda m: f"{m.group(1)}{m.group(2).strip()}{m.group(1)}", content
        )

    # First pass: "**prefer**" is rewritten to "**prefer**" (no-op).
    result_1 = buggy_fix(text)
    # Second pass (simulating a while loop / repeated run): if matching
    # resumes at the end of the first bold span, the closing ** of block one
    # and the opening ** of block two get paired around " tea ", whose
    # surrounding spaces then get stripped.
    result_2 = buggy_fix(result_1)
    print(f"Original: {text}")
    print(f"Step 1: {result_1}")
    print(f"Step 2: {result_2} (Bug Reproduced!)")


if __name__ == "__main__":
    reproduce_bug()

View File

@@ -0,0 +1,28 @@
import re


def reproduce_bug_v2():
    """Closer simulation of the legacy plugin code behind Issue #49.

    The old version looped over the text several times, and its regex could
    drift across adjacent bold blocks when handling nested or consecutive
    emphasis.
    """
    text = "I **prefer** tea **to** coffee."
    # Greedy-leaning pattern with no lookahead/lookbehind guards.
    buggy_pattern = re.compile(r"(\*\*)( +)(.*?)( +)(\*\*)")
    # The logic amounts to "whenever ** has spaces next to it, 'repair' it".
    # In "I **prefer** tea **to**" those spaces sit between "prefer**" and
    # "**to", i.e. between two DIFFERENT bold blocks.
    content = "I **prefer** tea **to** coffee."
    # Faulty pairing: the closing marker of block A and the opening marker
    # of block B are mistaken for one span:
    #   I **prefer** tea **to**
    #            ^^       ^^
    #            A        B
    # The regex believes A opens the span and B closes it.
    bug_result = re.sub(r"\*\*( +)(.*?)( +)\*\*", r"**\2**", content)
    print(f"Input: {content}")
    print(f"Output: {bug_result}")


if __name__ == "__main__":
    reproduce_bug_v2()

View File

@@ -0,0 +1,44 @@
import sys
import os

# Make the plugin package importable when this script is run directly.
current_dir = os.path.dirname(os.path.abspath(__file__))
plugin_dir = os.path.abspath(os.path.join(current_dir, "..", "filters", "markdown_normalizer"))
sys.path.append(plugin_dir)

from markdown_normalizer import ContentNormalizer, NormalizerConfig


def test_latex_protection():
    r"""Verify that LaTeX commands survive escape-fix normalization.

    Regression check for the bug where backslash commands such as \times,
    \nu and \theta were corrupted by the escape-fixing pass.
    """
    # Test case 1: the originally reported issue with \times.
    content_1 = r"Calculation: $C(33, 6) \times C(16, 1)$"
    config = NormalizerConfig(enable_escape_fix=True)
    normalizer = ContentNormalizer(config)
    result_1 = normalizer.normalize(content_1)
    # BUGFIX: the label was a non-raw string, so "\times" printed as a TAB
    # followed by "imes". A raw string keeps the backslash literal.
    print(r"--- Test 1: \times Protection ---")
    print(f"Input: {content_1}")
    print(f"Output: {result_1}")
    if r"\times" in result_1:
        print("✅ PASSED")
    else:
        print("❌ FAILED")

    # Test case 2: other escape collisions — \nu (collides with newline)
    # and \theta (collides with tab).
    content_2 = r"Formula: $\theta = \nu + \tau$"
    result_2 = normalizer.normalize(content_2)
    # BUGFIX: "\theta" printed a TAB and "\nu" printed a NEWLINE in the old
    # label; double the backslashes so the commands print literally. The
    # leading "\n" is an intentional blank line between the two reports.
    print("\n--- Test 2: \\theta and \\nu Protection ---")
    print(f"Input: {content_2}")
    print(f"Output: {result_2}")
    if r"\theta" in result_2 and r"\nu" in result_2:
        print("✅ PASSED")
    else:
        print("❌ FAILED")


if __name__ == "__main__":
    test_latex_protection()

View File

@@ -0,0 +1,42 @@
import re


def verify_fix_v126():
    """Run the v1.2.6 emphasis-normalization regex over known trouble cases.

    Prints each input alongside its normalized output so the fix for
    Issue #49 (cross-block matching of consecutive bold spans) can be
    inspected by eye.
    """
    # 1. Inputs that used to trigger the bug.
    test_cases = [
        "I **prefer** tea **to** coffee.",  # canonical Issue #49 case
        "The **quick** brown **fox** jumps **over**.",  # several bold blocks
        "** text ** and ** more **",  # inner padding that SHOULD be fixed
        "Calculations: 2 * 3 * 4 = 24",  # math that must not read as emphasis
    ]

    # 2. Core regex shipped in v1.2.6 (the Chinese comments that broke
    # parsing have been removed).
    # Pattern: (?<!\*|_)(\*{1,3}|_{1,3})(?P<inner>(?:(?!\1)[^\n])*?)(\1)(?!\*|_)
    pattern_str = r"(?<!\*|_)(\*{1,3}|_{1,3})(?P<inner>(?:(?!\1)[^\n])*?)(\1)(?!\*|_)"
    FIX_REGEX = re.compile(pattern_str)

    def fixed_normalizer(content):
        def replacer(match):
            marker = match.group(1)
            body = match.group("inner")
            trimmed = body.strip()
            # Only rewrite when padding was actually removed and the span
            # is non-empty after trimming.
            if trimmed != body and trimmed:
                return f"{marker}{trimmed}{marker}"
            return match.group(0)

        # The plugin applies the pass repeatedly; emulate two rounds.
        for _ in range(2):
            content = FIX_REGEX.sub(replacer, content)
        return content

    print("--- v1.2.6 Fix Verification ---")
    for sample in test_cases:
        print(f"Input: {sample}")
        print(f"Output: {fixed_normalizer(sample)}")
        print("-" * 30)


if __name__ == "__main__":
    verify_fix_v126()