feat(filters): upgrade markdown-normalizer to v1.2.7

- Fix Issue #49: resolve greedy regex matching in consecutive emphasis
- Add LaTeX formula protection to prevent corruption of \times, \nu, etc.
- Expand i18n support to 12 languages with strict alignment
- Fix NameError in Request import during testing
This commit is contained in:
fujie
2026-02-24 15:05:25 +08:00
parent 18ada2a177
commit 2da934dd92
16 changed files with 981 additions and 1009 deletions

View File

@@ -0,0 +1,31 @@
import re


def reproduce_bug():
    """Demonstrate the broken normalization logic described in Issue #49.

    The core problem: the regex is too eager across blocks, so repeated
    passes can treat the region between two bold spans ("** tea **") as a
    single bold span with padding spaces and strip it.
    """
    text = "I **prefer** tea **to** coffee."
    # A naive pattern that pairs any ** with the next **, with no guard
    # against crossing from one bold block into the next.
    buggy_pattern = re.compile(r"(\*\*)(.*?)(\*\*)")

    def buggy_fix(content):
        # Mimics the plugin's strip step: remove padding inside the markers.
        # When the regex pairs "prefer**" with the following "**", group(2)
        # ends up holding "prefer** tea " instead of a single span's text.
        return buggy_pattern.sub(
            lambda m: f"{m.group(1)}{m.group(2).strip()}{m.group(1)}", content
        )

    # First pass: "**prefer**" is rewritten to "**prefer**" (no-op).
    result_1 = buggy_fix(text)
    # Second pass (simulating a while loop / repeated run): if matching
    # resumes at the end of the first bold span, the closing ** of block one
    # and the opening ** of block two get paired around " tea ", whose
    # surrounding spaces then get stripped.
    result_2 = buggy_fix(result_1)
    print(f"Original: {text}")
    print(f"Step 1: {result_1}")
    print(f"Step 2: {result_2} (Bug Reproduced!)")


if __name__ == "__main__":
    reproduce_bug()

View File

@@ -0,0 +1,28 @@
import re


def reproduce_bug_v2():
    """Closer simulation of the legacy plugin code behind Issue #49.

    The old version looped over the text several times, and its regex could
    drift across adjacent bold blocks when handling nested or consecutive
    emphasis.
    """
    text = "I **prefer** tea **to** coffee."
    # Greedy-leaning pattern with no lookahead/lookbehind guards.
    buggy_pattern = re.compile(r"(\*\*)( +)(.*?)( +)(\*\*)")
    # The logic amounts to "whenever ** has spaces next to it, 'repair' it".
    # In "I **prefer** tea **to**" those spaces sit between "prefer**" and
    # "**to", i.e. between two DIFFERENT bold blocks.
    content = "I **prefer** tea **to** coffee."
    # Faulty pairing: the closing marker of block A and the opening marker
    # of block B are mistaken for one span:
    #   I **prefer** tea **to**
    #            ^^       ^^
    #            A        B
    # The regex believes A opens the span and B closes it.
    bug_result = re.sub(r"\*\*( +)(.*?)( +)\*\*", r"**\2**", content)
    print(f"Input: {content}")
    print(f"Output: {bug_result}")


if __name__ == "__main__":
    reproduce_bug_v2()

View File

@@ -0,0 +1,44 @@
import sys
import os

# Make the plugin package importable when this script is run directly.
current_dir = os.path.dirname(os.path.abspath(__file__))
plugin_dir = os.path.abspath(os.path.join(current_dir, "..", "filters", "markdown_normalizer"))
sys.path.append(plugin_dir)

from markdown_normalizer import ContentNormalizer, NormalizerConfig


def test_latex_protection():
    r"""Verify that LaTeX commands survive escape-fix normalization.

    Regression check for the bug where backslash commands such as \times,
    \nu and \theta were corrupted by the escape-fixing pass.
    """
    # Test case 1: the originally reported issue with \times.
    content_1 = r"Calculation: $C(33, 6) \times C(16, 1)$"
    config = NormalizerConfig(enable_escape_fix=True)
    normalizer = ContentNormalizer(config)
    result_1 = normalizer.normalize(content_1)
    # BUGFIX: the label was a non-raw string, so "\times" printed as a TAB
    # followed by "imes". A raw string keeps the backslash literal.
    print(r"--- Test 1: \times Protection ---")
    print(f"Input: {content_1}")
    print(f"Output: {result_1}")
    if r"\times" in result_1:
        print("✅ PASSED")
    else:
        print("❌ FAILED")

    # Test case 2: other escape collisions — \nu (collides with newline)
    # and \theta (collides with tab).
    content_2 = r"Formula: $\theta = \nu + \tau$"
    result_2 = normalizer.normalize(content_2)
    # BUGFIX: "\theta" printed a TAB and "\nu" printed a NEWLINE in the old
    # label; double the backslashes so the commands print literally. The
    # leading "\n" is an intentional blank line between the two reports.
    print("\n--- Test 2: \\theta and \\nu Protection ---")
    print(f"Input: {content_2}")
    print(f"Output: {result_2}")
    if r"\theta" in result_2 and r"\nu" in result_2:
        print("✅ PASSED")
    else:
        print("❌ FAILED")


if __name__ == "__main__":
    test_latex_protection()

View File

@@ -0,0 +1,42 @@
import re


def verify_fix_v126():
    """Run the v1.2.6 emphasis-normalization regex over known trouble cases.

    Prints each input alongside its normalized output so the fix for
    Issue #49 (cross-block matching of consecutive bold spans) can be
    inspected by eye.
    """
    # 1. Inputs that used to trigger the bug.
    test_cases = [
        "I **prefer** tea **to** coffee.",  # canonical Issue #49 case
        "The **quick** brown **fox** jumps **over**.",  # several bold blocks
        "** text ** and ** more **",  # inner padding that SHOULD be fixed
        "Calculations: 2 * 3 * 4 = 24",  # math that must not read as emphasis
    ]

    # 2. Core regex shipped in v1.2.6 (the Chinese comments that broke
    # parsing have been removed).
    # Pattern: (?<!\*|_)(\*{1,3}|_{1,3})(?P<inner>(?:(?!\1)[^\n])*?)(\1)(?!\*|_)
    pattern_str = r"(?<!\*|_)(\*{1,3}|_{1,3})(?P<inner>(?:(?!\1)[^\n])*?)(\1)(?!\*|_)"
    FIX_REGEX = re.compile(pattern_str)

    def fixed_normalizer(content):
        def replacer(match):
            marker = match.group(1)
            body = match.group("inner")
            trimmed = body.strip()
            # Only rewrite when padding was actually removed and the span
            # is non-empty after trimming.
            if trimmed != body and trimmed:
                return f"{marker}{trimmed}{marker}"
            return match.group(0)

        # The plugin applies the pass repeatedly; emulate two rounds.
        for _ in range(2):
            content = FIX_REGEX.sub(replacer, content)
        return content

    print("--- v1.2.6 Fix Verification ---")
    for sample in test_cases:
        print(f"Input: {sample}")
        print(f"Output: {fixed_normalizer(sample)}")
        print("-" * 30)


if __name__ == "__main__":
    verify_fix_v126()