Update Async Context Compression to v1.1.1: Add frontend debug logging and optimize token calculation

2026-01-10 18:47:06 +08:00
parent a9cae535eb
commit 7085e794a3
2 changed files with 542 additions and 212 deletions
--- a/plugins/filters/async-context-compression/async_context_compression.py
+++ b/plugins/filters/async-context-compression/async_context_compression.py
@@ -5,7 +5,7 @@ author: Fu-Jie
 author_url: https://github.com/Fu-Jie
 funding_url: https://github.com/Fu-Jie/awesome-openwebui
 description: Reduces token consumption in long conversations while maintaining coherence through intelligent summarization and message compression.
-version: 1.1.0
+version: 1.1.1
 openwebui_id: b1655bc8-6de9-4cad-8cb5-a6f7829a02ce
 license: MIT
@@ -139,6 +139,10 @@ debug_mode
  Default: true
  Description: Prints detailed debug information to the log. Recommended to set to `false` in production.
 show_debug_log
  Default: false
  Description: Print debug logs to browser console (F12). Useful for frontend debugging.
 🔧 Deployment
 ═══════════════════════════════════════════════════════
@@ -355,6 +359,9 @@ class Filter:
        debug_mode: bool = Field(
            default=True, description="Enable detailed logging for debugging."
        )
        show_debug_log: bool = Field(
            default=False, description="Print debug logs to browser console (F12)"
        )
    def _save_summary(self, chat_id: str, summary: str, compressed_count: int):
        """Saves the summary to the database."""
@@ -516,12 +523,109 @@ class Filter:
        return message
    async def _emit_debug_log(
        self,
        __event_call__,
        chat_id: str,
        original_count: int,
        compressed_count: int,
        summary_length: int,
        kept_first: int,
        kept_last: int,
    ):
        """Emit debug log to browser console via JS execution"""
        if not self.valves.show_debug_log or not __event_call__:
            return
        try:
            # Prepare data for JS
            log_data = {
                "chatId": chat_id,
                "originalCount": original_count,
                "compressedCount": compressed_count,
                "summaryLength": summary_length,
                "keptFirst": kept_first,
                "keptLast": kept_last,
                "ratio": (
                    f"{(1 - compressed_count/original_count)*100:.1f}%"
                    if original_count > 0
                    else "0%"
                ),
            }
            # Construct JS code
            js_code = f"""
                (async function() {{
                    console.group("🗜️ Async Context Compression Debug");
                    console.log("Chat ID:", {json.dumps(chat_id)});
                    console.log("Messages:", {original_count} + " -> " + {compressed_count});
                    console.log("Compression Ratio:", {json.dumps(log_data['ratio'])});
                    console.log("Summary Length:", {summary_length} + " chars");
                    console.log("Configuration:", {{
                        "Keep First": {kept_first},
                        "Keep Last": {kept_last}
                    }});
                    console.groupEnd();
                }})();
            """
            await __event_call__(
                {
                    "type": "execute",
                    "data": {"code": js_code},
                }
            )
        except Exception as e:
            print(f"Error emitting debug log: {e}")
    async def _log(self, message: str, type: str = "info", event_call=None):
        """Unified logging to both backend (print) and frontend (console.log)"""
        # Backend logging
        if self.valves.debug_mode:
            print(message)
        # Frontend logging
        if self.valves.show_debug_log and event_call:
            try:
                css = "color: #3b82f6;"  # Blue default
                if type == "error":
                    css = "color: #ef4444; font-weight: bold;"  # Red
                elif type == "warning":
                    css = "color: #f59e0b;"  # Orange
                elif type == "success":
                    css = "color: #10b981; font-weight: bold;"  # Green
                # Clean message for frontend: remove separators and extra newlines
                lines = message.split("\n")
                # Keep lines that don't start with lots of equals or hyphens
                filtered_lines = [
                    line
                    for line in lines
                    if not line.strip().startswith("====")
                    and not line.strip().startswith("----")
                ]
                clean_message = "\n".join(filtered_lines).strip()
                if not clean_message:
                    return
                # Escape quotes in message for JS string
                safe_message = clean_message.replace('"', '\\"').replace("\n", "\\n")
                js_code = f"""
                    console.log("%c[Compression] {safe_message}", "{css}");
                """
                await event_call({"type": "execute", "data": {"code": js_code}})
            except Exception as e:
                print(f"Failed to emit log to frontend: {e}")
    async def inlet(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __metadata__: dict = None,
        __event_emitter__: Callable[[Any], Awaitable[None]] = None,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ) -> dict:
        """
        Executed before sending to the LLM.
@@ -530,10 +634,11 @@ class Filter:
        messages = body.get("messages", [])
        chat_id = __metadata__["chat_id"]
-        if self.valves.debug_mode:
+        if self.valves.debug_mode or self.valves.show_debug_log:
-            print(f"\n{'='*60}")
+            await self._log(
-            print(f"[Inlet] Chat ID: {chat_id}")
+                f"\n{'='*60}\n[Inlet] Chat ID: {chat_id}\n[Inlet] Received {len(messages)} messages",
-            print(f"[Inlet] Received {len(messages)} messages")
+                event_call=__event_call__,
            )
        # Record the target compression progress for the original messages, for use in outlet
        # Target is to compress up to the (total - keep_last) message
@@ -541,17 +646,18 @@ class Filter:
        # [Optimization] Simple state cleanup check
        if chat_id in self.temp_state:
-            if self.valves.debug_mode:
+            await self._log(
-                print(
+                f"[Inlet] ⚠️ Overwriting unconsumed old state (Chat ID: {chat_id})",
-                    f"[Inlet] ⚠️ Overwriting unconsumed old state (Chat ID: {chat_id})"
+                type="warning",
-                )
+                event_call=__event_call__,
            )
        self.temp_state[chat_id] = target_compressed_count
-        if self.valves.debug_mode:
+        await self._log(
-            print(
+            f"[Inlet] Recorded target compression progress: {target_compressed_count}",
-                f"[Inlet] Recorded target compression progress: {target_compressed_count}"
+            event_call=__event_call__,
-            )
+        )
        # Load summary record
        summary_record = await asyncio.to_thread(self._load_summary_record, chat_id)
@@ -600,19 +706,32 @@ class Filter:
                    }
                )
-            if self.valves.debug_mode:
+            await self._log(
-                print(
+                f"[Inlet] Applied summary: Head({len(head_messages)}) + Summary + Tail({len(tail_messages)})",
-                    f"[Inlet] Applied summary: Head({len(head_messages)}) + Summary + Tail({len(tail_messages)})"
+                type="success",
-                )
+                event_call=__event_call__,
            )
            # Emit debug log to frontend (Keep the structured log as well)
            await self._emit_debug_log(
                __event_call__,
                chat_id,
                len(messages),
                len(final_messages),
                len(summary_record.summary),
                self.valves.keep_first,
                self.valves.keep_last,
            )
        else:
            # No summary, use original messages
            final_messages = messages
        body["messages"] = final_messages
-        if self.valves.debug_mode:
+        await self._log(
-            print(f"[Inlet] Final send: {len(body['messages'])} messages")
+            f"[Inlet] Final send: {len(body['messages'])} messages\n{'='*60}\n",
-            print(f"{'='*60}\n")
+            event_call=__event_call__,
        )
        return body
@@ -622,6 +741,7 @@ class Filter:
        __user__: Optional[dict] = None,
        __metadata__: dict = None,
        __event_emitter__: Callable[[Any], Awaitable[None]] = None,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ) -> dict:
        """
        Executed after the LLM response is complete.
@@ -630,21 +750,23 @@ class Filter:
        chat_id = __metadata__["chat_id"]
        model = body.get("model", "gpt-3.5-turbo")
-        if self.valves.debug_mode:
+        if self.valves.debug_mode or self.valves.show_debug_log:
-            print(f"\n{'='*60}")
+            await self._log(
-            print(f"[Outlet] Chat ID: {chat_id}")
+                f"\n{'='*60}\n[Outlet] Chat ID: {chat_id}\n[Outlet] Response complete",
-            print(f"[Outlet] Response complete")
+                event_call=__event_call__,
            )
        # Process Token calculation and summary generation asynchronously in the background (do not wait for completion, do not affect output)
        asyncio.create_task(
            self._check_and_generate_summary_async(
-                chat_id, model, body, __user__, __event_emitter__
+                chat_id, model, body, __user__, __event_emitter__, __event_call__
            )
        )
-        if self.valves.debug_mode:
+        await self._log(
-            print(f"[Outlet] Background processing started")
+            f"[Outlet] Background processing started\n{'='*60}\n",
-            print(f"{'='*60}\n")
+            event_call=__event_call__,
        )
        return body
@@ -655,6 +777,7 @@ class Filter:
        body: dict,
        user_data: Optional[dict],
        __event_emitter__: Callable[[Any], Awaitable[None]] = None,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ):
        """
        Background processing: Calculates Token count and generates summary (does not block response).
@@ -668,36 +791,50 @@ class Filter:
                "compression_threshold_tokens", self.valves.compression_threshold_tokens
            )
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"\n[🔍 Background Calculation] Starting Token count...")
+                f"\n[🔍 Background Calculation] Starting Token count...",
                event_call=__event_call__,
            )
            # Calculate Token count in a background thread
            current_tokens = await asyncio.to_thread(
                self._calculate_messages_tokens, messages
            )
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🔍 Background Calculation] Token count: {current_tokens}")
+                f"[🔍 Background Calculation] Token count: {current_tokens}",
                event_call=__event_call__,
            )
            # Check if compression is needed
            if current_tokens >= compression_threshold_tokens:
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🔍 Background Calculation] ⚡ Compression threshold triggered (Token: {current_tokens} >= {compression_threshold_tokens})",
-                        f"[🔍 Background Calculation] ⚡ Compression threshold triggered (Token: {current_tokens} >= {compression_threshold_tokens})"
+                    type="warning",
-                    )
+                    event_call=__event_call__,
                )
                # Proceed to generate summary
                await self._generate_summary_async(
-                    messages, chat_id, body, user_data, __event_emitter__
+                    messages,
                    chat_id,
                    body,
                    user_data,
                    __event_emitter__,
                    __event_call__,
                )
            else:
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🔍 Background Calculation] Compression threshold not reached (Token: {current_tokens} < {compression_threshold_tokens})",
-                        f"[🔍 Background Calculation] Compression threshold not reached (Token: {current_tokens} < {compression_threshold_tokens})"
+                    event_call=__event_call__,
-                    )
+                )
        except Exception as e:
-            print(f"[🔍 Background Calculation] ❌ Error: {str(e)}")
+            await self._log(
                f"[🔍 Background Calculation] ❌ Error: {str(e)}",
                type="error",
                event_call=__event_call__,
            )
    async def _generate_summary_async(
        self,
@@ -706,6 +843,7 @@ class Filter:
        body: dict,
        user_data: Optional[dict],
        __event_emitter__: Callable[[Any], Awaitable[None]] = None,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ):
        """
        Generates summary asynchronously (runs in background, does not block response).
@@ -715,18 +853,20 @@ class Filter:
        3. Generate summary for the remaining middle messages.
        """
        try:
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"\n[🤖 Async Summary Task] Starting...")
+                f"\n[🤖 Async Summary Task] Starting...", event_call=__event_call__
            )
            # 1. Get target compression progress
            # Prioritize getting from temp_state (calculated by inlet). If unavailable (e.g., after restart), assume current is full history.
            target_compressed_count = self.temp_state.pop(chat_id, None)
            if target_compressed_count is None:
                target_compressed_count = max(0, len(messages) - self.valves.keep_last)
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🤖 Async Summary Task] ⚠️ Could not get inlet state, estimating progress using current message count: {target_compressed_count}",
-                        f"[🤖 Async Summary Task] ⚠️ Could not get inlet state, estimating progress using current message count: {target_compressed_count}"
+                    type="warning",
-                    )
+                    event_call=__event_call__,
                )
            # 2. Determine the range of messages to compress (Middle)
            start_index = self.valves.keep_first
@@ -736,18 +876,18 @@ class Filter:
            # Ensure indices are valid
            if start_index >= end_index:
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🤖 Async Summary Task] Middle messages empty (Start: {start_index}, End: {end_index}), skipping",
-                        f"[🤖 Async Summary Task] Middle messages empty (Start: {start_index}, End: {end_index}), skipping"
+                    event_call=__event_call__,
-                    )
+                )
                return
            middle_messages = messages[start_index:end_index]
-            if self.valves.debug_mode:
+            await self._log(
-                print(
+                f"[🤖 Async Summary Task] Middle messages to process: {len(middle_messages)}",
-                    f"[🤖 Async Summary Task] Middle messages to process: {len(middle_messages)}"
+                event_call=__event_call__,
-                )
+            )
            # 3. Check Token limit and truncate (Max Context Truncation)
            # [Optimization] Use the summary model's (if any) threshold to decide how many middle messages can be processed
@@ -762,22 +902,26 @@ class Filter:
                "max_context_tokens", self.valves.max_context_tokens
            )
-            if self.valves.debug_mode:
+            await self._log(
-                print(
+                f"[🤖 Async Summary Task] Using max limit for model {summary_model_id}: {max_context_tokens} Tokens",
-                    f"[🤖 Async Summary Task] Using max limit for model {summary_model_id}: {max_context_tokens} Tokens"
+                event_call=__event_call__,
                )
            # Calculate current total Tokens (using summary model for counting)
            total_tokens = await asyncio.to_thread(
                self._calculate_messages_tokens, messages
            )
-            if total_tokens > max_context_tokens:
+            # Calculate tokens for middle messages only (plus buffer for prompt)
-                excess_tokens = total_tokens - max_context_tokens
+            # We only send middle_messages to the summary model, so we shouldn't count the full history against its limit.
-                if self.valves.debug_mode:
+            middle_tokens = await asyncio.to_thread(
-                    print(
+                self._calculate_messages_tokens, middle_messages
-                        f"[🤖 Async Summary Task] ⚠️ Total Tokens ({total_tokens}) exceed summary model limit ({max_context_tokens}), need to remove approx {excess_tokens} Tokens"
+            )
-                    )
+            # Add buffer for prompt and output (approx 2000 tokens)
            estimated_input_tokens = middle_tokens + 2000
            if estimated_input_tokens > max_context_tokens:
                excess_tokens = estimated_input_tokens - max_context_tokens
                await self._log(
                    f"[🤖 Async Summary Task] ⚠️ Middle messages ({middle_tokens} Tokens) + Buffer exceed summary model limit ({max_context_tokens}), need to remove approx {excess_tokens} Tokens",
                    type="warning",
                    event_call=__event_call__,
                )
                # Remove from the head of middle_messages
                removed_tokens = 0
@@ -785,20 +929,22 @@ class Filter:
                while removed_tokens < excess_tokens and middle_messages:
                    msg_to_remove = middle_messages.pop(0)
-                    msg_tokens = self._count_tokens(str(msg_to_remove.get("content", "")))
+                    msg_tokens = self._count_tokens(
                        str(msg_to_remove.get("content", ""))
                    )
                    removed_tokens += msg_tokens
                    removed_count += 1
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🤖 Async Summary Task] Removed {removed_count} messages, totaling {removed_tokens} Tokens",
-                        f"[🤖 Async Summary Task] Removed {removed_count} messages, totaling {removed_tokens} Tokens"
+                    event_call=__event_call__,
-                    )
+                )
            if not middle_messages:
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🤖 Async Summary Task] Middle messages empty after truncation, skipping summary generation",
-                        f"[🤖 Async Summary Task] Middle messages empty after truncation, skipping summary generation"
+                    event_call=__event_call__,
-                    )
+                )
                return
            # 4. Build conversation text
@@ -820,14 +966,14 @@ class Filter:
                )
            new_summary = await self._call_summary_llm(
-                None, conversation_text, body, user_data
+                None, conversation_text, body, user_data, __event_call__
            )
            # 6. Save new summary
-            if self.valves.debug_mode:
+            await self._log(
-                print(
+                "[Optimization] Saving summary in a background thread to avoid blocking the event loop.",
-                    "[Optimization] Saving summary in a background thread to avoid blocking the event loop."
+                event_call=__event_call__,
-                )
+            )
            await asyncio.to_thread(
                self._save_summary, chat_id, new_summary, target_compressed_count
@@ -845,16 +991,22 @@ class Filter:
                    }
                )
-            if self.valves.debug_mode:
+            await self._log(
-                print(
+                f"[🤖 Async Summary Task] ✅ Complete! New summary length: {len(new_summary)} characters",
-                    f"[🤖 Async Summary Task] ✅ Complete! New summary length: {len(new_summary)} characters"
+                type="success",
-                )
+                event_call=__event_call__,
-                print(
+            )
-                    f"[🤖 Async Summary Task] Progress update: Compressed up to original message {target_compressed_count}"
+            await self._log(
-                )
+                f"[🤖 Async Summary Task] Progress update: Compressed up to original message {target_compressed_count}",
                event_call=__event_call__,
            )
        except Exception as e:
-            print(f"[🤖 Async Summary Task] ❌ Error: {str(e)}")
+            await self._log(
                f"[🤖 Async Summary Task] ❌ Error: {str(e)}",
                type="error",
                event_call=__event_call__,
            )
            import traceback
            traceback.print_exc()
@@ -891,12 +1043,15 @@ class Filter:
        new_conversation_text: str,
        body: dict,
        user_data: dict,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ) -> str:
        """
        Calls the LLM to generate a summary using Open WebUI's built-in method.
        """
-        if self.valves.debug_mode:
+        await self._log(
-            print(f"[🤖 LLM Call] Using Open WebUI's built-in method")
+            f"[🤖 LLM Call] Using Open WebUI's built-in method",
            event_call=__event_call__,
        )
        # Build summary prompt (Optimized)
        summary_prompt = f"""
@@ -935,8 +1090,7 @@ Based on the content above, generate the summary:
        # Determine the model to use
        model = self.valves.summary_model or body.get("model", "")
-        if self.valves.debug_mode:
+        await self._log(f"[🤖 LLM Call] Model: {model}", event_call=__event_call__)
            print(f"[🤖 LLM Call] Model: {model}")
        # Build payload
        payload = {
@@ -954,18 +1108,19 @@ Based on the content above, generate the summary:
                raise ValueError("Could not get user ID")
            # [Optimization] Get user object in a background thread to avoid blocking the event loop.
-            if self.valves.debug_mode:
+            await self._log(
-                print(
+                "[Optimization] Getting user object in a background thread to avoid blocking the event loop.",
-                    "[Optimization] Getting user object in a background thread to avoid blocking the event loop."
+                event_call=__event_call__,
-                )
+            )
            user = await asyncio.to_thread(Users.get_user_by_id, user_id)
            if not user:
                raise ValueError(f"Could not find user: {user_id}")
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🤖 LLM Call] User: {user.email}")
+                f"[🤖 LLM Call] User: {user.email}\n[🤖 LLM Call] Sending request...",
-                print(f"[🤖 LLM Call] Sending request...")
+                event_call=__event_call__,
            )
            # Create Request object
            request = Request(scope={"type": "http", "app": webui_app})
@@ -978,8 +1133,11 @@ Based on the content above, generate the summary:
            summary = response["choices"][0]["message"]["content"].strip()
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🤖 LLM Call] ✅ Successfully received summary")
+                f"[🤖 LLM Call] ✅ Successfully received summary",
                type="success",
                event_call=__event_call__,
            )
            return summary
@@ -991,7 +1149,10 @@ Based on the content above, generate the summary:
                    "If this is a pipeline (Pipe) model or an incompatible model, please specify a compatible summary model (e.g., 'gemini-2.5-flash') in the configuration."
                )
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🤖 LLM Call] ❌ {error_message}")
+                f"[🤖 LLM Call] ❌ {error_message}",
                type="error",
                event_call=__event_call__,
            )
            raise Exception(error_message)
--- a/plugins/filters/async-context-compression/async_context_compression_cn.py
+++ b/plugins/filters/async-context-compression/async_context_compression_cn.py
@@ -5,7 +5,7 @@ author: Fu-Jie
 author_url: https://github.com/Fu-Jie
 funding_url: https://github.com/Fu-Jie/awesome-openwebui
 description: 通过智能摘要和消息压缩，降低长对话的 token 消耗，同时保持对话连贯性。
-version: 1.1.0
+version: 1.1.1
 openwebui_id: 5c0617cb-a9e4-4bd6-a440-d276534ebd18
 license: MIT
@@ -138,6 +138,10 @@ debug_mode (调试模式)
  默认: true
  说明: 在日志中打印详细的调试信息。生产环境建议设为 `false`。
 show_debug_log (前端调试日志)
  默认: false
  说明: 在浏览器控制台打印调试日志 (F12)。便于前端调试。
 🔧 部署配置
 ═══════════════════════════════════════════════════════
@@ -345,6 +349,9 @@ class Filter:
            default=0.1, ge=0.0, le=2.0, description="摘要生成的温度参数"
        )
        debug_mode: bool = Field(default=True, description="调试模式，打印详细日志")
        show_debug_log: bool = Field(
            default=False, description="在浏览器控制台打印调试日志 (F12)"
        )
    def _save_summary(self, chat_id: str, summary: str, compressed_count: int):
        """保存摘要到数据库"""
@@ -426,9 +433,7 @@ class Filter:
        # 回退策略：粗略估算 (1 token ≈ 4 chars)
        return len(text) // 4
-    def _calculate_messages_tokens(
+    def _calculate_messages_tokens(self, messages: List[Dict]) -> int:
        self, messages: List[Dict]
    ) -> int:
        """计算消息列表的总 Token 数"""
        total_tokens = 0
        for msg in messages:
@@ -502,12 +507,109 @@ class Filter:
        return message
    async def _emit_debug_log(
        self,
        __event_call__,
        chat_id: str,
        original_count: int,
        compressed_count: int,
        summary_length: int,
        kept_first: int,
        kept_last: int,
    ):
        """Emit debug log to browser console via JS execution"""
        if not self.valves.show_debug_log or not __event_call__:
            return
        try:
            # Prepare data for JS
            log_data = {
                "chatId": chat_id,
                "originalCount": original_count,
                "compressedCount": compressed_count,
                "summaryLength": summary_length,
                "keptFirst": kept_first,
                "keptLast": kept_last,
                "ratio": (
                    f"{(1 - compressed_count/original_count)*100:.1f}%"
                    if original_count > 0
                    else "0%"
                ),
            }
            # Construct JS code
            js_code = f"""
                (async function() {{
                    console.group("🗜️ Async Context Compression Debug");
                    console.log("Chat ID:", {json.dumps(chat_id)});
                    console.log("Messages:", {original_count} + " -> " + {compressed_count});
                    console.log("Compression Ratio:", {json.dumps(log_data['ratio'])});
                    console.log("Summary Length:", {summary_length} + " chars");
                    console.log("Configuration:", {{
                        "Keep First": {kept_first},
                        "Keep Last": {kept_last}
                    }});
                    console.groupEnd();
                }})();
            """
            await __event_call__(
                {
                    "type": "execute",
                    "data": {"code": js_code},
                }
            )
        except Exception as e:
            print(f"Error emitting debug log: {e}")
    async def _log(self, message: str, type: str = "info", event_call=None):
        """统一日志输出到后端 (print) 和前端 (console.log)"""
        # 后端日志
        if self.valves.debug_mode:
            print(message)
        # 前端日志
        if self.valves.show_debug_log and event_call:
            try:
                css = "color: #3b82f6;"  # 默认蓝色
                if type == "error":
                    css = "color: #ef4444; font-weight: bold;"  # 红色
                elif type == "warning":
                    css = "color: #f59e0b;"  # 橙色
                elif type == "success":
                    css = "color: #10b981; font-weight: bold;"  # 绿色
                # 清理前端消息：移除分隔符和多余换行
                lines = message.split("\n")
                # 保留不以大量等号或连字符开头的行
                filtered_lines = [
                    line
                    for line in lines
                    if not line.strip().startswith("====")
                    and not line.strip().startswith("----")
                ]
                clean_message = "\n".join(filtered_lines).strip()
                if not clean_message:
                    return
                # 转义消息中的引号和换行符
                safe_message = clean_message.replace('"', '\\"').replace("\n", "\\n")
                js_code = f"""
                    console.log("%c[压缩] {safe_message}", "{css}");
                """
                await event_call({"type": "execute", "data": {"code": js_code}})
            except Exception as e:
                print(f"发送前端日志失败: {e}")
    async def inlet(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __metadata__: dict = None,
        __event_emitter__: Callable[[Any], Awaitable[None]] = None,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ) -> dict:
        """
        在发送到 LLM 之前执行
@@ -516,10 +618,11 @@ class Filter:
        messages = body.get("messages", [])
        chat_id = __metadata__["chat_id"]
-        if self.valves.debug_mode:
+        if self.valves.debug_mode or self.valves.show_debug_log:
-            print(f"\n{'='*60}")
+            await self._log(
-            print(f"[Inlet] Chat ID: {chat_id}")
+                f"\n{'='*60}\n[Inlet] Chat ID: {chat_id}\n[Inlet] 收到 {len(messages)} 条消息",
-            print(f"[Inlet] 收到 {len(messages)} 条消息")
+                event_call=__event_call__,
            )
        # 记录原始消息的目标压缩进度，供 outlet 使用
        # 目标是压缩到倒数第 keep_last 条之前
@@ -527,13 +630,18 @@ class Filter:
        # [优化] 简单的状态清理检查
        if chat_id in self.temp_state:
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[Inlet] ⚠️ 覆盖未消费的旧状态 (Chat ID: {chat_id})")
+                f"[Inlet] ⚠️ 覆盖未消费的旧状态 (Chat ID: {chat_id})",
                type="warning",
                event_call=__event_call__,
            )
        self.temp_state[chat_id] = target_compressed_count
-        if self.valves.debug_mode:
+        await self._log(
-            print(f"[Inlet] 记录目标压缩进度: {target_compressed_count}")
+            f"[Inlet] 记录目标压缩进度: {target_compressed_count}",
            event_call=__event_call__,
        )
        # 加载摘要记录
        summary_record = await asyncio.to_thread(self._load_summary_record, chat_id)
@@ -582,19 +690,32 @@ class Filter:
                    }
                )
-            if self.valves.debug_mode:
+            await self._log(
-                print(
+                f"[Inlet] 应用摘要: Head({len(head_messages)}) + Summary + Tail({len(tail_messages)})",
-                    f"[Inlet] 应用摘要: Head({len(head_messages)}) + Summary + Tail({len(tail_messages)})"
+                type="success",
-                )
+                event_call=__event_call__,
            )
            # Emit debug log to frontend (Keep the structured log as well)
            await self._emit_debug_log(
                __event_call__,
                chat_id,
                len(messages),
                len(final_messages),
                len(summary_record.summary),
                self.valves.keep_first,
                self.valves.keep_last,
            )
        else:
            # 没有摘要，使用原始消息
            final_messages = messages
        body["messages"] = final_messages
-        if self.valves.debug_mode:
+        await self._log(
-            print(f"[Inlet] 最终发送: {len(body['messages'])} 条消息")
+            f"[Inlet] 最终发送: {len(body['messages'])} 条消息\n{'='*60}\n",
-            print(f"{'='*60}\n")
+            event_call=__event_call__,
        )
        return body
@@ -604,6 +725,7 @@ class Filter:
        __user__: Optional[dict] = None,
        __metadata__: dict = None,
        __event_emitter__: Callable[[Any], Awaitable[None]] = None,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ) -> dict:
        """
        在 LLM 响应完成后执行
@@ -612,21 +734,23 @@ class Filter:
        chat_id = __metadata__["chat_id"]
        model = body.get("model", "gpt-3.5-turbo")
-        if self.valves.debug_mode:
+        if self.valves.debug_mode or self.valves.show_debug_log:
-            print(f"\n{'='*60}")
+            await self._log(
-            print(f"[Outlet] Chat ID: {chat_id}")
+                f"\n{'='*60}\n[Outlet] Chat ID: {chat_id}\n[Outlet] 响应完成",
-            print(f"[Outlet] 响应完成")
+                event_call=__event_call__,
            )
        # 在后台异步处理 Token 计算和摘要生成（不等待完成，不影响输出）
        asyncio.create_task(
            self._check_and_generate_summary_async(
-                chat_id, model, body, __user__, __event_emitter__
+                chat_id, model, body, __user__, __event_emitter__, __event_call__
            )
        )
-        if self.valves.debug_mode:
+        await self._log(
-            print(f"[Outlet] 后台处理已启动")
+            f"[Outlet] 后台处理已启动\n{'='*60}\n",
-            print(f"{'='*60}\n")
+            event_call=__event_call__,
        )
        return body
@@ -637,6 +761,7 @@ class Filter:
        body: dict,
        user_data: Optional[dict],
        __event_emitter__: Callable[[Any], Awaitable[None]] = None,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ):
        """
        后台处理：计算 Token 数并生成摘要（不阻塞响应）
@@ -650,36 +775,50 @@ class Filter:
                "compression_threshold_tokens", self.valves.compression_threshold_tokens
            )
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"\n[🔍 后台计算] 开始 Token 计数...")
+                f"\n[🔍 后台计算] 开始 Token 计数...",
                event_call=__event_call__,
            )
            # 在后台线程中计算 Token 数
            current_tokens = await asyncio.to_thread(
                self._calculate_messages_tokens, messages
            )
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🔍 后台计算] Token 数: {current_tokens}")
+                f"[🔍 后台计算] Token 数: {current_tokens}",
                event_call=__event_call__,
            )
            # 检查是否需要压缩
            if current_tokens >= compression_threshold_tokens:
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🔍 后台计算] ⚡ 触发压缩阈值 (Token: {current_tokens} >= {compression_threshold_tokens})",
-                        f"[🔍 后台计算] ⚡ 触发压缩阈值 (Token: {current_tokens} >= {compression_threshold_tokens})"
+                    type="warning",
-                    )
+                    event_call=__event_call__,
                )
                # 继续生成摘要
                await self._generate_summary_async(
-                    messages, chat_id, body, user_data, __event_emitter__
+                    messages,
                    chat_id,
                    body,
                    user_data,
                    __event_emitter__,
                    __event_call__,
                )
            else:
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🔍 后台计算] 未触发压缩阈值 (Token: {current_tokens} < {compression_threshold_tokens})",
-                        f"[🔍 后台计算] 未触发压缩阈值 (Token: {current_tokens} < {compression_threshold_tokens})"
+                    event_call=__event_call__,
-                    )
+                )
        except Exception as e:
-            print(f"[🔍 后台计算] ❌ 错误: {str(e)}")
+            await self._log(
                f"[🔍 后台计算] ❌ 错误: {str(e)}",
                type="error",
                event_call=__event_call__,
            )
    async def _generate_summary_async(
        self,
@@ -688,6 +827,7 @@ class Filter:
        body: dict,
        user_data: Optional[dict],
        __event_emitter__: Callable[[Any], Awaitable[None]] = None,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ):
        """
        异步生成摘要（后台执行，不阻塞响应）
@@ -697,18 +837,18 @@ class Filter:
        3. 对剩余的中间消息生成摘要。
        """
        try:
-            if self.valves.debug_mode:
+            await self._log(f"\n[🤖 异步摘要任务] 开始...", event_call=__event_call__)
                print(f"\n[🤖 异步摘要任务] 开始...")
            # 1. 获取目标压缩进度
            # 优先从 temp_state 获取（由 inlet 计算），如果获取不到（例如重启后），则假设当前是完整历史
            target_compressed_count = self.temp_state.pop(chat_id, None)
            if target_compressed_count is None:
                target_compressed_count = max(0, len(messages) - self.valves.keep_last)
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🤖 异步摘要任务] ⚠️ 无法获取 inlet 状态，使用当前消息数估算进度: {target_compressed_count}",
-                        f"[🤖 异步摘要任务] ⚠️ 无法获取 inlet 状态，使用当前消息数估算进度: {target_compressed_count}"
+                    type="warning",
-                    )
+                    event_call=__event_call__,
                )
            # 2. 确定待压缩的消息范围 (Middle)
            start_index = self.valves.keep_first
@@ -718,16 +858,18 @@ class Filter:
            # 确保索引有效
            if start_index >= end_index:
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🤖 异步摘要任务] 中间消息为空 (Start: {start_index}, End: {end_index})，跳过",
-                        f"[🤖 异步摘要任务] 中间消息为空 (Start: {start_index}, End: {end_index})，跳过"
+                    event_call=__event_call__,
-                    )
+                )
                return
            middle_messages = messages[start_index:end_index]
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🤖 异步摘要任务] 待处理中间消息: {len(middle_messages)} 条")
+                f"[🤖 异步摘要任务] 待处理中间消息: {len(middle_messages)} 条",
                event_call=__event_call__,
            )
            # 3. 检查 Token 上限并截断 (Max Context Truncation)
            # [优化] 使用摘要模型(如果有)的阈值来决定能处理多少中间消息
@@ -740,22 +882,26 @@ class Filter:
                "max_context_tokens", self.valves.max_context_tokens
            )
-            if self.valves.debug_mode:
+            await self._log(
-                print(
+                f"[🤖 异步摘要任务] 使用模型 {summary_model_id} 的上限: {max_context_tokens} Tokens",
-                    f"[🤖 异步摘要任务] 使用模型 {summary_model_id} 的上限: {max_context_tokens} Tokens"
+                event_call=__event_call__,
                )
            # 计算当前总 Token (使用摘要模型进行计数)
            total_tokens = await asyncio.to_thread(
                self._calculate_messages_tokens, messages
            )
-            if total_tokens > max_context_tokens:
+            # 计算中间消息的 Token (加上提示词的缓冲)
-                excess_tokens = total_tokens - max_context_tokens
+            # 我们只把 middle_messages 发送给摘要模型，所以不应该把完整历史计入限制
-                if self.valves.debug_mode:
+            middle_tokens = await asyncio.to_thread(
-                    print(
+                self._calculate_messages_tokens, middle_messages
-                        f"[🤖 异步摘要任务] ⚠️ 总 Token ({total_tokens}) 超过摘要模型上限 ({max_context_tokens})，需要移除约 {excess_tokens} Token"
+            )
-                    )
+            # 增加提示词和输出的缓冲 (约 2000 Tokens)
            estimated_input_tokens = middle_tokens + 2000
            if estimated_input_tokens > max_context_tokens:
                excess_tokens = estimated_input_tokens - max_context_tokens
                await self._log(
                    f"[🤖 异步摘要任务] ⚠️ 中间消息 ({middle_tokens} Tokens) + 缓冲超过摘要模型上限 ({max_context_tokens})，需要移除约 {excess_tokens} Token",
                    type="warning",
                    event_call=__event_call__,
                )
                # 从 middle_messages 头部开始移除
                removed_tokens = 0
@@ -769,14 +915,16 @@ class Filter:
                    removed_tokens += msg_tokens
                    removed_count += 1
-                if self.valves.debug_mode:
+                await self._log(
-                    print(
+                    f"[🤖 异步摘要任务] 已移除 {removed_count} 条消息，共 {removed_tokens} Token",
-                        f"[🤖 异步摘要任务] 已移除 {removed_count} 条消息，共 {removed_tokens} Token"
+                    event_call=__event_call__,
-                    )
+                )
            if not middle_messages:
-                if self.valves.debug_mode:
+                await self._log(
-                    print(f"[🤖 异步摘要任务] 截断后中间消息为空，跳过摘要生成")
+                    f"[🤖 异步摘要任务] 截断后中间消息为空，跳过摘要生成",
                    event_call=__event_call__,
                )
                return
            # 4. 构建对话文本
@@ -798,12 +946,14 @@ class Filter:
                )
            new_summary = await self._call_summary_llm(
-                None, conversation_text, body, user_data
+                None, conversation_text, body, user_data, __event_call__
            )
            # 6. 保存新摘要
-            if self.valves.debug_mode:
+            await self._log(
-                print("[优化] 正在后台线程中保存摘要，以避免阻塞事件循环。")
+                "[优化] 在后台线程中保存摘要以避免阻塞事件循环。",
                event_call=__event_call__,
            )
            await asyncio.to_thread(
                self._save_summary, chat_id, new_summary, target_compressed_count
@@ -815,32 +965,40 @@ class Filter:
                    {
                        "type": "status",
                        "data": {
-                            "description": f"上下文摘要已更新 (已压缩 {len(middle_messages)} 条消息)",
+                            "description": f"上下文摘要已更新 (压缩了 {len(middle_messages)} 条消息)",
                            "done": True,
                        },
                    }
                )
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🤖 异步摘要任务] ✅ 完成！新摘要长度: {len(new_summary)} 字符")
+                f"[🤖 异步摘要任务] ✅ 完成！新摘要长度: {len(new_summary)} 字符",
-                print(
+                type="success",
-                    f"[🤖 异步摘要任务] 进度更新: 已压缩至原始第 {target_compressed_count} 条消息"
+                event_call=__event_call__,
-                )
+            )
            await self._log(
                f"[🤖 异步摘要任务] 进度更新: 已压缩至原始消息 {target_compressed_count}",
                event_call=__event_call__,
            )
        except Exception as e:
-            print(f"[🤖 异步摘要任务] ❌ 错误: {str(e)}")
+            await self._log(
                f"[🤖 异步摘要任务] ❌ 错误: {str(e)}",
                type="error",
                event_call=__event_call__,
            )
            import traceback
            traceback.print_exc()
    def _format_messages_for_summary(self, messages: list) -> str:
-        """格式化消息用于摘要"""
+        """Formats messages for summarization."""
        formatted = []
        for i, msg in enumerate(messages, 1):
            role = msg.get("role", "unknown")
            content = msg.get("content", "")
-            # 处理多模态内容
+            # Handle multimodal content
            if isinstance(content, list):
                text_parts = []
                for part in content:
@@ -848,10 +1006,10 @@ class Filter:
                        text_parts.append(part.get("text", ""))
                content = " ".join(text_parts)
-            # 处理角色名称
+            # Handle role name
-            role_name = {"user": "用户", "assistant": "助手"}.get(role, role)
+            role_name = {"user": "User", "assistant": "Assistant"}.get(role, role)
-            # 限制每条消息的长度，避免过长
+            # Limit length of each message to avoid excessive length
            if len(content) > 500:
                content = content[:500] + "..."
@@ -865,12 +1023,15 @@ class Filter:
        new_conversation_text: str,
        body: dict,
        user_data: dict,
        __event_call__: Callable[[Any], Awaitable[None]] = None,
    ) -> str:
        """
-        使用 Open WebUI 内置方法调用 LLM 生成摘要
+        调用 LLM 生成摘要，使用 Open Web UI 的内置方法。
        """
-        if self.valves.debug_mode:
+        await self._log(
-            print(f"[🤖 LLM 调用] 使用 Open WebUI 内置方法")
+            f"[🤖 LLM 调用] 使用 Open Web UI 内置方法",
            event_call=__event_call__,
        )
        # 构建摘要提示词 (优化版)
        summary_prompt = f"""
@@ -909,8 +1070,7 @@ class Filter:
        # 确定使用的模型
        model = self.valves.summary_model or body.get("model", "")
-        if self.valves.debug_mode:
+        await self._log(f"[🤖 LLM 调用] 模型: {model}", event_call=__event_call__)
            print(f"[🤖 LLM 调用] 模型: {model}")
        # 构建 payload
        payload = {
@@ -927,17 +1087,20 @@ class Filter:
            if not user_id:
                raise ValueError("无法获取用户 ID")
-            # [优化] 在后台线程中获取用户对象，以避免阻塞事件循环
+            # [优化] 在后台线程中获取用户对象以避免阻塞事件循环
-            if self.valves.debug_mode:
+            await self._log(
-                print("[优化] 正在后台线程中获取用户对象，以避免阻塞事件循环。")
+                "[优化] 在后台线程中获取用户对象以避免阻塞事件循环。",
                event_call=__event_call__,
            )
            user = await asyncio.to_thread(Users.get_user_by_id, user_id)
            if not user:
                raise ValueError(f"无法找到用户: {user_id}")
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🤖 LLM 调用] 用户: {user.email}")
+                f"[🤖 LLM 调用] 用户: {user.email}\n[🤖 LLM 调用] 发送请求...",
-                print(f"[🤖 LLM 调用] 发送请求...")
+                event_call=__event_call__,
            )
            # 创建 Request 对象
            request = Request(scope={"type": "http", "app": webui_app})
@@ -950,8 +1113,11 @@ class Filter:
            summary = response["choices"][0]["message"]["content"].strip()
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🤖 LLM 调用] ✅ 成功获取摘要")
+                f"[🤖 LLM 调用] ✅ 成功接收摘要",
                type="success",
                event_call=__event_call__,
            )
            return summary
@@ -959,11 +1125,14 @@ class Filter:
            error_message = f"调用 LLM ({model}) 生成摘要时发生错误: {str(e)}"
            if not self.valves.summary_model:
                error_message += (
-                    "\n[提示] 您没有指定摘要模型 (summary_model)，因此尝试使用当前对话的模型。"
+                    "\n[提示] 您未指定 summary_model，因此过滤器尝试使用当前对话的模型。"
-                    "如果这是一个流水线（Pipe）模型或不兼容的模型，请在配置中指定一个兼容的摘要模型（如 'gemini-2.5-flash'）。"
+                    "如果这是流水线 (Pipe) 模型或不兼容的模型，请在配置中指定兼容的摘要模型 (例如 'gemini-2.5-flash')。"
                )
-            if self.valves.debug_mode:
+            await self._log(
-                print(f"[🤖 LLM 调用] ❌ {error_message}")
+                f"[🤖 LLM 调用] ❌ {error_message}",
                type="error",
                event_call=__event_call__,
            )
            raise Exception(error_message)