Update Async Context Compression to v1.1.1: Add frontend debug logging and optimize token calculation
This commit is contained in:
@@ -5,7 +5,7 @@ author: Fu-Jie
|
|||||||
author_url: https://github.com/Fu-Jie
|
author_url: https://github.com/Fu-Jie
|
||||||
funding_url: https://github.com/Fu-Jie/awesome-openwebui
|
funding_url: https://github.com/Fu-Jie/awesome-openwebui
|
||||||
description: Reduces token consumption in long conversations while maintaining coherence through intelligent summarization and message compression.
|
description: Reduces token consumption in long conversations while maintaining coherence through intelligent summarization and message compression.
|
||||||
version: 1.1.0
|
version: 1.1.1
|
||||||
openwebui_id: b1655bc8-6de9-4cad-8cb5-a6f7829a02ce
|
openwebui_id: b1655bc8-6de9-4cad-8cb5-a6f7829a02ce
|
||||||
license: MIT
|
license: MIT
|
||||||
|
|
||||||
@@ -139,6 +139,10 @@ debug_mode
|
|||||||
Default: true
|
Default: true
|
||||||
Description: Prints detailed debug information to the log. Recommended to set to `false` in production.
|
Description: Prints detailed debug information to the log. Recommended to set to `false` in production.
|
||||||
|
|
||||||
|
show_debug_log
|
||||||
|
Default: false
|
||||||
|
Description: Print debug logs to browser console (F12). Useful for frontend debugging.
|
||||||
|
|
||||||
🔧 Deployment
|
🔧 Deployment
|
||||||
═══════════════════════════════════════════════════════
|
═══════════════════════════════════════════════════════
|
||||||
|
|
||||||
@@ -355,6 +359,9 @@ class Filter:
|
|||||||
debug_mode: bool = Field(
|
debug_mode: bool = Field(
|
||||||
default=True, description="Enable detailed logging for debugging."
|
default=True, description="Enable detailed logging for debugging."
|
||||||
)
|
)
|
||||||
|
show_debug_log: bool = Field(
|
||||||
|
default=False, description="Print debug logs to browser console (F12)"
|
||||||
|
)
|
||||||
|
|
||||||
def _save_summary(self, chat_id: str, summary: str, compressed_count: int):
|
def _save_summary(self, chat_id: str, summary: str, compressed_count: int):
|
||||||
"""Saves the summary to the database."""
|
"""Saves the summary to the database."""
|
||||||
@@ -516,12 +523,109 @@ class Filter:
|
|||||||
|
|
||||||
return message
|
return message
|
||||||
|
|
||||||
|
async def _emit_debug_log(
    self,
    __event_call__,
    chat_id: str,
    original_count: int,
    compressed_count: int,
    summary_length: int,
    kept_first: int,
    kept_last: int,
):
    """Emit a grouped compression report to the browser console via JS execution.

    Args:
        __event_call__: Awaitable callback that is awaited with an
            ``{"type": "execute", "data": {"code": ...}}`` event.
        chat_id: Identifier of the chat being reported.
        original_count: Number of messages before compression.
        compressed_count: Number of messages after compression.
        summary_length: Length of the summary, reported as characters.
        kept_first: Value reported as "Keep First".
        kept_last: Value reported as "Keep Last".

    Does nothing when ``valves.show_debug_log`` is off or no callback is
    given. Any exception raised while building or sending the report is
    printed and swallowed (best-effort logging).
    """
    import json  # local import keeps this method self-contained

    if not self.valves.show_debug_log or not __event_call__:
        return

    try:
        # Share of messages removed; guard against division by zero.
        if original_count > 0:
            ratio = f"{(1 - compressed_count / original_count) * 100:.1f}%"
        else:
            ratio = "0%"

        # Every interpolated value goes through json.dumps so that it is
        # always a valid JavaScript literal (strings quoted and escaped,
        # None rendered as null) instead of a raw Python repr.
        configuration = json.dumps(
            {"Keep First": kept_first, "Keep Last": kept_last}
        )
        js_lines = [
            "(async function() {",
            '    console.group("🗜️ Async Context Compression Debug");',
            f'    console.log("Chat ID:", {json.dumps(chat_id)});',
            '    console.log("Messages:", '
            f'{json.dumps(original_count)} + " -> " + {json.dumps(compressed_count)});',
            f'    console.log("Compression Ratio:", {json.dumps(ratio)});',
            f'    console.log("Summary Length:", {json.dumps(summary_length)} + " chars");',
            f'    console.log("Configuration:", {configuration});',
            "    console.groupEnd();",
            "})();",
        ]
        js_code = "\n".join(js_lines)

        await __event_call__(
            {
                "type": "execute",
                "data": {"code": js_code},
            }
        )
    except Exception as e:
        print(f"Error emitting debug log: {e}")
|
||||||
|
|
||||||
|
async def _log(self, message: str, type: str = "info", event_call=None):
    """Unified logging to both backend (print) and frontend (console.log).

    Args:
        message: Text to log; may span several lines.
        type: Severity. "error", "warning" and "success" select a console
            colour; any other value uses the default blue.
        event_call: Awaitable callback that is awaited with an
            ``{"type": "execute", "data": {"code": ...}}`` event. Frontend
            logging is skipped when it is falsy.

    The backend print happens only when ``valves.debug_mode`` is set and the
    frontend log only when ``valves.show_debug_log`` is set. Failures while
    emitting to the frontend are printed and never raised.
    """
    import json  # local import keeps this method self-contained

    # Backend logging
    if self.valves.debug_mode:
        print(message)

    # Frontend logging
    if not (self.valves.show_debug_log and event_call):
        return

    try:
        if type == "error":
            css = "color: #ef4444; font-weight: bold;"  # Red
        elif type == "warning":
            css = "color: #f59e0b;"  # Orange
        elif type == "success":
            css = "color: #10b981; font-weight: bold;"  # Green
        else:
            css = "color: #3b82f6;"  # Blue default

        # Clean message for frontend: drop separator lines (runs of "=" or
        # "-") and surrounding blank space.
        kept_lines = [
            line
            for line in message.split("\n")
            if not line.strip().startswith(("====", "----"))
        ]
        clean_message = "\n".join(kept_lines).strip()
        if not clean_message:
            return

        # json.dumps produces valid JavaScript string literals, so quotes,
        # backslashes, carriage returns and other control characters in the
        # message cannot break out of the string. The message is passed as
        # a separate %s argument so that "%" sequences inside it are not
        # interpreted as console format specifiers.
        js_code = (
            'console.log("%c[Compression] %s", '
            f"{json.dumps(css)}, {json.dumps(clean_message)});"
        )
        await event_call({"type": "execute", "data": {"code": js_code}})
    except Exception as e:
        print(f"Failed to emit log to frontend: {e}")
|
||||||
|
|
||||||
async def inlet(
|
async def inlet(
|
||||||
self,
|
self,
|
||||||
body: dict,
|
body: dict,
|
||||||
__user__: Optional[dict] = None,
|
__user__: Optional[dict] = None,
|
||||||
__metadata__: dict = None,
|
__metadata__: dict = None,
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
Executed before sending to the LLM.
|
Executed before sending to the LLM.
|
||||||
@@ -530,10 +634,11 @@ class Filter:
|
|||||||
messages = body.get("messages", [])
|
messages = body.get("messages", [])
|
||||||
chat_id = __metadata__["chat_id"]
|
chat_id = __metadata__["chat_id"]
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
if self.valves.debug_mode or self.valves.show_debug_log:
|
||||||
print(f"\n{'='*60}")
|
await self._log(
|
||||||
print(f"[Inlet] Chat ID: {chat_id}")
|
f"\n{'='*60}\n[Inlet] Chat ID: {chat_id}\n[Inlet] Received {len(messages)} messages",
|
||||||
print(f"[Inlet] Received {len(messages)} messages")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# Record the target compression progress for the original messages, for use in outlet
|
# Record the target compression progress for the original messages, for use in outlet
|
||||||
# Target is to compress up to the (total - keep_last) message
|
# Target is to compress up to the (total - keep_last) message
|
||||||
@@ -541,17 +646,18 @@ class Filter:
|
|||||||
|
|
||||||
# [Optimization] Simple state cleanup check
|
# [Optimization] Simple state cleanup check
|
||||||
if chat_id in self.temp_state:
|
if chat_id in self.temp_state:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[Inlet] ⚠️ Overwriting unconsumed old state (Chat ID: {chat_id})",
|
||||||
f"[Inlet] ⚠️ Overwriting unconsumed old state (Chat ID: {chat_id})"
|
type="warning",
|
||||||
)
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
self.temp_state[chat_id] = target_compressed_count
|
self.temp_state[chat_id] = target_compressed_count
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[Inlet] Recorded target compression progress: {target_compressed_count}",
|
||||||
f"[Inlet] Recorded target compression progress: {target_compressed_count}"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Load summary record
|
# Load summary record
|
||||||
summary_record = await asyncio.to_thread(self._load_summary_record, chat_id)
|
summary_record = await asyncio.to_thread(self._load_summary_record, chat_id)
|
||||||
@@ -600,19 +706,32 @@ class Filter:
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[Inlet] Applied summary: Head({len(head_messages)}) + Summary + Tail({len(tail_messages)})",
|
||||||
f"[Inlet] Applied summary: Head({len(head_messages)}) + Summary + Tail({len(tail_messages)})"
|
type="success",
|
||||||
)
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Emit debug log to frontend (Keep the structured log as well)
|
||||||
|
await self._emit_debug_log(
|
||||||
|
__event_call__,
|
||||||
|
chat_id,
|
||||||
|
len(messages),
|
||||||
|
len(final_messages),
|
||||||
|
len(summary_record.summary),
|
||||||
|
self.valves.keep_first,
|
||||||
|
self.valves.keep_last,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# No summary, use original messages
|
# No summary, use original messages
|
||||||
final_messages = messages
|
final_messages = messages
|
||||||
|
|
||||||
body["messages"] = final_messages
|
body["messages"] = final_messages
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[Inlet] Final send: {len(body['messages'])} messages")
|
f"[Inlet] Final send: {len(body['messages'])} messages\n{'='*60}\n",
|
||||||
print(f"{'='*60}\n")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
return body
|
return body
|
||||||
|
|
||||||
@@ -622,6 +741,7 @@ class Filter:
|
|||||||
__user__: Optional[dict] = None,
|
__user__: Optional[dict] = None,
|
||||||
__metadata__: dict = None,
|
__metadata__: dict = None,
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
Executed after the LLM response is complete.
|
Executed after the LLM response is complete.
|
||||||
@@ -630,21 +750,23 @@ class Filter:
|
|||||||
chat_id = __metadata__["chat_id"]
|
chat_id = __metadata__["chat_id"]
|
||||||
model = body.get("model", "gpt-3.5-turbo")
|
model = body.get("model", "gpt-3.5-turbo")
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
if self.valves.debug_mode or self.valves.show_debug_log:
|
||||||
print(f"\n{'='*60}")
|
await self._log(
|
||||||
print(f"[Outlet] Chat ID: {chat_id}")
|
f"\n{'='*60}\n[Outlet] Chat ID: {chat_id}\n[Outlet] Response complete",
|
||||||
print(f"[Outlet] Response complete")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# Process Token calculation and summary generation asynchronously in the background (do not wait for completion, do not affect output)
|
# Process Token calculation and summary generation asynchronously in the background (do not wait for completion, do not affect output)
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self._check_and_generate_summary_async(
|
self._check_and_generate_summary_async(
|
||||||
chat_id, model, body, __user__, __event_emitter__
|
chat_id, model, body, __user__, __event_emitter__, __event_call__
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[Outlet] Background processing started")
|
f"[Outlet] Background processing started\n{'='*60}\n",
|
||||||
print(f"{'='*60}\n")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
return body
|
return body
|
||||||
|
|
||||||
@@ -655,6 +777,7 @@ class Filter:
|
|||||||
body: dict,
|
body: dict,
|
||||||
user_data: Optional[dict],
|
user_data: Optional[dict],
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Background processing: Calculates Token count and generates summary (does not block response).
|
Background processing: Calculates Token count and generates summary (does not block response).
|
||||||
@@ -668,36 +791,50 @@ class Filter:
|
|||||||
"compression_threshold_tokens", self.valves.compression_threshold_tokens
|
"compression_threshold_tokens", self.valves.compression_threshold_tokens
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"\n[🔍 Background Calculation] Starting Token count...")
|
f"\n[🔍 Background Calculation] Starting Token count...",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# Calculate Token count in a background thread
|
# Calculate Token count in a background thread
|
||||||
current_tokens = await asyncio.to_thread(
|
current_tokens = await asyncio.to_thread(
|
||||||
self._calculate_messages_tokens, messages
|
self._calculate_messages_tokens, messages
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🔍 Background Calculation] Token count: {current_tokens}")
|
f"[🔍 Background Calculation] Token count: {current_tokens}",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# Check if compression is needed
|
# Check if compression is needed
|
||||||
if current_tokens >= compression_threshold_tokens:
|
if current_tokens >= compression_threshold_tokens:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🔍 Background Calculation] ⚡ Compression threshold triggered (Token: {current_tokens} >= {compression_threshold_tokens})",
|
||||||
f"[🔍 Background Calculation] ⚡ Compression threshold triggered (Token: {current_tokens} >= {compression_threshold_tokens})"
|
type="warning",
|
||||||
)
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# Proceed to generate summary
|
# Proceed to generate summary
|
||||||
await self._generate_summary_async(
|
await self._generate_summary_async(
|
||||||
messages, chat_id, body, user_data, __event_emitter__
|
messages,
|
||||||
|
chat_id,
|
||||||
|
body,
|
||||||
|
user_data,
|
||||||
|
__event_emitter__,
|
||||||
|
__event_call__,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🔍 Background Calculation] Compression threshold not reached (Token: {current_tokens} < {compression_threshold_tokens})",
|
||||||
f"[🔍 Background Calculation] Compression threshold not reached (Token: {current_tokens} < {compression_threshold_tokens})"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[🔍 Background Calculation] ❌ Error: {str(e)}")
|
await self._log(
|
||||||
|
f"[🔍 Background Calculation] ❌ Error: {str(e)}",
|
||||||
|
type="error",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
async def _generate_summary_async(
|
async def _generate_summary_async(
|
||||||
self,
|
self,
|
||||||
@@ -706,6 +843,7 @@ class Filter:
|
|||||||
body: dict,
|
body: dict,
|
||||||
user_data: Optional[dict],
|
user_data: Optional[dict],
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Generates summary asynchronously (runs in background, does not block response).
|
Generates summary asynchronously (runs in background, does not block response).
|
||||||
@@ -715,18 +853,20 @@ class Filter:
|
|||||||
3. Generate summary for the remaining middle messages.
|
3. Generate summary for the remaining middle messages.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"\n[🤖 Async Summary Task] Starting...")
|
f"\n[🤖 Async Summary Task] Starting...", event_call=__event_call__
|
||||||
|
)
|
||||||
|
|
||||||
# 1. Get target compression progress
|
# 1. Get target compression progress
|
||||||
# Prioritize getting from temp_state (calculated by inlet). If unavailable (e.g., after restart), assume current is full history.
|
# Prioritize getting from temp_state (calculated by inlet). If unavailable (e.g., after restart), assume current is full history.
|
||||||
target_compressed_count = self.temp_state.pop(chat_id, None)
|
target_compressed_count = self.temp_state.pop(chat_id, None)
|
||||||
if target_compressed_count is None:
|
if target_compressed_count is None:
|
||||||
target_compressed_count = max(0, len(messages) - self.valves.keep_last)
|
target_compressed_count = max(0, len(messages) - self.valves.keep_last)
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 Async Summary Task] ⚠️ Could not get inlet state, estimating progress using current message count: {target_compressed_count}",
|
||||||
f"[🤖 Async Summary Task] ⚠️ Could not get inlet state, estimating progress using current message count: {target_compressed_count}"
|
type="warning",
|
||||||
)
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 2. Determine the range of messages to compress (Middle)
|
# 2. Determine the range of messages to compress (Middle)
|
||||||
start_index = self.valves.keep_first
|
start_index = self.valves.keep_first
|
||||||
@@ -736,18 +876,18 @@ class Filter:
|
|||||||
|
|
||||||
# Ensure indices are valid
|
# Ensure indices are valid
|
||||||
if start_index >= end_index:
|
if start_index >= end_index:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 Async Summary Task] Middle messages empty (Start: {start_index}, End: {end_index}), skipping",
|
||||||
f"[🤖 Async Summary Task] Middle messages empty (Start: {start_index}, End: {end_index}), skipping"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
middle_messages = messages[start_index:end_index]
|
middle_messages = messages[start_index:end_index]
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 Async Summary Task] Middle messages to process: {len(middle_messages)}",
|
||||||
f"[🤖 Async Summary Task] Middle messages to process: {len(middle_messages)}"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 3. Check Token limit and truncate (Max Context Truncation)
|
# 3. Check Token limit and truncate (Max Context Truncation)
|
||||||
# [Optimization] Use the summary model's (if any) threshold to decide how many middle messages can be processed
|
# [Optimization] Use the summary model's (if any) threshold to decide how many middle messages can be processed
|
||||||
@@ -762,22 +902,26 @@ class Filter:
|
|||||||
"max_context_tokens", self.valves.max_context_tokens
|
"max_context_tokens", self.valves.max_context_tokens
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 Async Summary Task] Using max limit for model {summary_model_id}: {max_context_tokens} Tokens",
|
||||||
f"[🤖 Async Summary Task] Using max limit for model {summary_model_id}: {max_context_tokens} Tokens"
|
event_call=__event_call__,
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate current total Tokens (using summary model for counting)
|
|
||||||
total_tokens = await asyncio.to_thread(
|
|
||||||
self._calculate_messages_tokens, messages
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if total_tokens > max_context_tokens:
|
# Calculate tokens for middle messages only (plus buffer for prompt)
|
||||||
excess_tokens = total_tokens - max_context_tokens
|
# We only send middle_messages to the summary model, so we shouldn't count the full history against its limit.
|
||||||
if self.valves.debug_mode:
|
middle_tokens = await asyncio.to_thread(
|
||||||
print(
|
self._calculate_messages_tokens, middle_messages
|
||||||
f"[🤖 Async Summary Task] ⚠️ Total Tokens ({total_tokens}) exceed summary model limit ({max_context_tokens}), need to remove approx {excess_tokens} Tokens"
|
)
|
||||||
)
|
# Add buffer for prompt and output (approx 2000 tokens)
|
||||||
|
estimated_input_tokens = middle_tokens + 2000
|
||||||
|
|
||||||
|
if estimated_input_tokens > max_context_tokens:
|
||||||
|
excess_tokens = estimated_input_tokens - max_context_tokens
|
||||||
|
await self._log(
|
||||||
|
f"[🤖 Async Summary Task] ⚠️ Middle messages ({middle_tokens} Tokens) + Buffer exceed summary model limit ({max_context_tokens}), need to remove approx {excess_tokens} Tokens",
|
||||||
|
type="warning",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# Remove from the head of middle_messages
|
# Remove from the head of middle_messages
|
||||||
removed_tokens = 0
|
removed_tokens = 0
|
||||||
@@ -785,20 +929,22 @@ class Filter:
|
|||||||
|
|
||||||
while removed_tokens < excess_tokens and middle_messages:
|
while removed_tokens < excess_tokens and middle_messages:
|
||||||
msg_to_remove = middle_messages.pop(0)
|
msg_to_remove = middle_messages.pop(0)
|
||||||
msg_tokens = self._count_tokens(str(msg_to_remove.get("content", "")))
|
msg_tokens = self._count_tokens(
|
||||||
|
str(msg_to_remove.get("content", ""))
|
||||||
|
)
|
||||||
removed_tokens += msg_tokens
|
removed_tokens += msg_tokens
|
||||||
removed_count += 1
|
removed_count += 1
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 Async Summary Task] Removed {removed_count} messages, totaling {removed_tokens} Tokens",
|
||||||
f"[🤖 Async Summary Task] Removed {removed_count} messages, totaling {removed_tokens} Tokens"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not middle_messages:
|
if not middle_messages:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 Async Summary Task] Middle messages empty after truncation, skipping summary generation",
|
||||||
f"[🤖 Async Summary Task] Middle messages empty after truncation, skipping summary generation"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# 4. Build conversation text
|
# 4. Build conversation text
|
||||||
@@ -820,14 +966,14 @@ class Filter:
|
|||||||
)
|
)
|
||||||
|
|
||||||
new_summary = await self._call_summary_llm(
|
new_summary = await self._call_summary_llm(
|
||||||
None, conversation_text, body, user_data
|
None, conversation_text, body, user_data, __event_call__
|
||||||
)
|
)
|
||||||
|
|
||||||
# 6. Save new summary
|
# 6. Save new summary
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
"[Optimization] Saving summary in a background thread to avoid blocking the event loop.",
|
||||||
"[Optimization] Saving summary in a background thread to avoid blocking the event loop."
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
|
|
||||||
await asyncio.to_thread(
|
await asyncio.to_thread(
|
||||||
self._save_summary, chat_id, new_summary, target_compressed_count
|
self._save_summary, chat_id, new_summary, target_compressed_count
|
||||||
@@ -845,16 +991,22 @@ class Filter:
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 Async Summary Task] ✅ Complete! New summary length: {len(new_summary)} characters",
|
||||||
f"[🤖 Async Summary Task] ✅ Complete! New summary length: {len(new_summary)} characters"
|
type="success",
|
||||||
)
|
event_call=__event_call__,
|
||||||
print(
|
)
|
||||||
f"[🤖 Async Summary Task] Progress update: Compressed up to original message {target_compressed_count}"
|
await self._log(
|
||||||
)
|
f"[🤖 Async Summary Task] Progress update: Compressed up to original message {target_compressed_count}",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[🤖 Async Summary Task] ❌ Error: {str(e)}")
|
await self._log(
|
||||||
|
f"[🤖 Async Summary Task] ❌ Error: {str(e)}",
|
||||||
|
type="error",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
@@ -891,12 +1043,15 @@ class Filter:
|
|||||||
new_conversation_text: str,
|
new_conversation_text: str,
|
||||||
body: dict,
|
body: dict,
|
||||||
user_data: dict,
|
user_data: dict,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Calls the LLM to generate a summary using Open WebUI's built-in method.
|
Calls the LLM to generate a summary using Open WebUI's built-in method.
|
||||||
"""
|
"""
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 LLM Call] Using Open WebUI's built-in method")
|
f"[🤖 LLM Call] Using Open WebUI's built-in method",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# Build summary prompt (Optimized)
|
# Build summary prompt (Optimized)
|
||||||
summary_prompt = f"""
|
summary_prompt = f"""
|
||||||
@@ -935,8 +1090,7 @@ Based on the content above, generate the summary:
|
|||||||
# Determine the model to use
|
# Determine the model to use
|
||||||
model = self.valves.summary_model or body.get("model", "")
|
model = self.valves.summary_model or body.get("model", "")
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(f"[🤖 LLM Call] Model: {model}", event_call=__event_call__)
|
||||||
print(f"[🤖 LLM Call] Model: {model}")
|
|
||||||
|
|
||||||
# Build payload
|
# Build payload
|
||||||
payload = {
|
payload = {
|
||||||
@@ -954,18 +1108,19 @@ Based on the content above, generate the summary:
|
|||||||
raise ValueError("Could not get user ID")
|
raise ValueError("Could not get user ID")
|
||||||
|
|
||||||
# [Optimization] Get user object in a background thread to avoid blocking the event loop.
|
# [Optimization] Get user object in a background thread to avoid blocking the event loop.
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
"[Optimization] Getting user object in a background thread to avoid blocking the event loop.",
|
||||||
"[Optimization] Getting user object in a background thread to avoid blocking the event loop."
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
user = await asyncio.to_thread(Users.get_user_by_id, user_id)
|
user = await asyncio.to_thread(Users.get_user_by_id, user_id)
|
||||||
|
|
||||||
if not user:
|
if not user:
|
||||||
raise ValueError(f"Could not find user: {user_id}")
|
raise ValueError(f"Could not find user: {user_id}")
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 LLM Call] User: {user.email}")
|
f"[🤖 LLM Call] User: {user.email}\n[🤖 LLM Call] Sending request...",
|
||||||
print(f"[🤖 LLM Call] Sending request...")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# Create Request object
|
# Create Request object
|
||||||
request = Request(scope={"type": "http", "app": webui_app})
|
request = Request(scope={"type": "http", "app": webui_app})
|
||||||
@@ -978,8 +1133,11 @@ Based on the content above, generate the summary:
|
|||||||
|
|
||||||
summary = response["choices"][0]["message"]["content"].strip()
|
summary = response["choices"][0]["message"]["content"].strip()
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 LLM Call] ✅ Successfully received summary")
|
f"[🤖 LLM Call] ✅ Successfully received summary",
|
||||||
|
type="success",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
return summary
|
return summary
|
||||||
|
|
||||||
@@ -991,7 +1149,10 @@ Based on the content above, generate the summary:
|
|||||||
"If this is a pipeline (Pipe) model or an incompatible model, please specify a compatible summary model (e.g., 'gemini-2.5-flash') in the configuration."
|
"If this is a pipeline (Pipe) model or an incompatible model, please specify a compatible summary model (e.g., 'gemini-2.5-flash') in the configuration."
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 LLM Call] ❌ {error_message}")
|
f"[🤖 LLM Call] ❌ {error_message}",
|
||||||
|
type="error",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
raise Exception(error_message)
|
raise Exception(error_message)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ author: Fu-Jie
|
|||||||
author_url: https://github.com/Fu-Jie
|
author_url: https://github.com/Fu-Jie
|
||||||
funding_url: https://github.com/Fu-Jie/awesome-openwebui
|
funding_url: https://github.com/Fu-Jie/awesome-openwebui
|
||||||
description: 通过智能摘要和消息压缩,降低长对话的 token 消耗,同时保持对话连贯性。
|
description: 通过智能摘要和消息压缩,降低长对话的 token 消耗,同时保持对话连贯性。
|
||||||
version: 1.1.0
|
version: 1.1.1
|
||||||
openwebui_id: 5c0617cb-a9e4-4bd6-a440-d276534ebd18
|
openwebui_id: 5c0617cb-a9e4-4bd6-a440-d276534ebd18
|
||||||
license: MIT
|
license: MIT
|
||||||
|
|
||||||
@@ -138,6 +138,10 @@ debug_mode (调试模式)
|
|||||||
默认: true
|
默认: true
|
||||||
说明: 在日志中打印详细的调试信息。生产环境建议设为 `false`。
|
说明: 在日志中打印详细的调试信息。生产环境建议设为 `false`。
|
||||||
|
|
||||||
|
show_debug_log (前端调试日志)
|
||||||
|
默认: false
|
||||||
|
说明: 在浏览器控制台打印调试日志 (F12)。便于前端调试。
|
||||||
|
|
||||||
🔧 部署配置
|
🔧 部署配置
|
||||||
═══════════════════════════════════════════════════════
|
═══════════════════════════════════════════════════════
|
||||||
|
|
||||||
@@ -345,6 +349,9 @@ class Filter:
|
|||||||
default=0.1, ge=0.0, le=2.0, description="摘要生成的温度参数"
|
default=0.1, ge=0.0, le=2.0, description="摘要生成的温度参数"
|
||||||
)
|
)
|
||||||
debug_mode: bool = Field(default=True, description="调试模式,打印详细日志")
|
debug_mode: bool = Field(default=True, description="调试模式,打印详细日志")
|
||||||
|
show_debug_log: bool = Field(
|
||||||
|
default=False, description="在浏览器控制台打印调试日志 (F12)"
|
||||||
|
)
|
||||||
|
|
||||||
def _save_summary(self, chat_id: str, summary: str, compressed_count: int):
|
def _save_summary(self, chat_id: str, summary: str, compressed_count: int):
|
||||||
"""保存摘要到数据库"""
|
"""保存摘要到数据库"""
|
||||||
@@ -426,9 +433,7 @@ class Filter:
|
|||||||
# 回退策略:粗略估算 (1 token ≈ 4 chars)
|
# 回退策略:粗略估算 (1 token ≈ 4 chars)
|
||||||
return len(text) // 4
|
return len(text) // 4
|
||||||
|
|
||||||
def _calculate_messages_tokens(
|
def _calculate_messages_tokens(self, messages: List[Dict]) -> int:
|
||||||
self, messages: List[Dict]
|
|
||||||
) -> int:
|
|
||||||
"""计算消息列表的总 Token 数"""
|
"""计算消息列表的总 Token 数"""
|
||||||
total_tokens = 0
|
total_tokens = 0
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
@@ -502,12 +507,109 @@ class Filter:
|
|||||||
|
|
||||||
return message
|
return message
|
||||||
|
|
||||||
|
async def _emit_debug_log(
    self,
    __event_call__,
    chat_id: str,
    original_count: int,
    compressed_count: int,
    summary_length: int,
    kept_first: int,
    kept_last: int,
):
    """Emit a grouped compression report to the browser console via JS execution.

    Args:
        __event_call__: Awaitable callback that is awaited with an
            ``{"type": "execute", "data": {"code": ...}}`` event.
        chat_id: Identifier of the chat being reported.
        original_count: Number of messages before compression.
        compressed_count: Number of messages after compression.
        summary_length: Length of the summary, reported as characters.
        kept_first: Value reported as "Keep First".
        kept_last: Value reported as "Keep Last".

    Does nothing when ``valves.show_debug_log`` is off or no callback is
    given. Any exception raised while building or sending the report is
    printed and swallowed (best-effort logging).
    """
    import json  # local import keeps this method self-contained

    if not self.valves.show_debug_log or not __event_call__:
        return

    try:
        # Share of messages removed; guard against division by zero.
        if original_count > 0:
            ratio = f"{(1 - compressed_count / original_count) * 100:.1f}%"
        else:
            ratio = "0%"

        # Every interpolated value goes through json.dumps so that it is
        # always a valid JavaScript literal (strings quoted and escaped,
        # None rendered as null) instead of a raw Python repr.
        configuration = json.dumps(
            {"Keep First": kept_first, "Keep Last": kept_last}
        )
        js_lines = [
            "(async function() {",
            '    console.group("🗜️ Async Context Compression Debug");',
            f'    console.log("Chat ID:", {json.dumps(chat_id)});',
            '    console.log("Messages:", '
            f'{json.dumps(original_count)} + " -> " + {json.dumps(compressed_count)});',
            f'    console.log("Compression Ratio:", {json.dumps(ratio)});',
            f'    console.log("Summary Length:", {json.dumps(summary_length)} + " chars");',
            f'    console.log("Configuration:", {configuration});',
            "    console.groupEnd();",
            "})();",
        ]
        js_code = "\n".join(js_lines)

        await __event_call__(
            {
                "type": "execute",
                "data": {"code": js_code},
            }
        )
    except Exception as e:
        print(f"Error emitting debug log: {e}")
|
||||||
|
|
||||||
|
async def _log(self, message: str, type: str = "info", event_call=None):
    """Write a log message to the backend and, optionally, the browser console.

    The message is printed on the server when the ``debug_mode`` valve is on.
    When the ``show_debug_log`` valve is on and ``event_call`` is provided, a
    cleaned copy is also sent to the frontend as an ``execute`` event that
    runs ``console.log`` with a colour chosen from ``type``.

    Args:
        message: Text to log; may span several lines.
        type: One of ``"info"``, ``"warning"``, ``"error"`` or ``"success"``;
            any other value uses the default (blue) style.
        event_call: Awaitable callback that receives the event dict, or
            ``None`` to skip frontend logging.

    Failures while sending the frontend log are caught and printed; they are
    never raised to the caller.
    """
    # Local import keeps this method self-contained regardless of what the
    # module imports at the top.
    import json

    # Backend log
    if self.valves.debug_mode:
        print(message)

    # Frontend log
    if not (self.valves.show_debug_log and event_call):
        return

    try:
        styles = {
            "error": "color: #ef4444; font-weight: bold;",  # red
            "warning": "color: #f59e0b;",  # orange
            "success": "color: #10b981; font-weight: bold;",  # green
        }
        css = styles.get(type, "color: #3b82f6;")  # blue by default

        # Drop separator lines (runs of "=" or "-") and surrounding blanks,
        # which only make sense in the backend log.
        kept_lines = [
            line
            for line in message.split("\n")
            if not line.strip().startswith(("====", "----"))
        ]
        clean_message = "\n".join(kept_lines).strip()

        if not clean_message:
            return

        # The message may carry arbitrary text (exception strings, model
        # names, ...). Encode it with json.dumps so quotes, backslashes and
        # control characters cannot terminate the JS string literal, and pass
        # it as a %s argument so "%" sequences inside it are not interpreted
        # as console format specifiers.
        js_code = f"""
            console.log("%c[压缩] %s", {json.dumps(css)}, {json.dumps(clean_message)});
        """
        await event_call({"type": "execute", "data": {"code": js_code}})
    except Exception as e:
        # Best-effort: frontend logging must never break the filter.
        print(f"发送前端日志失败: {e}")
|
||||||
|
|
||||||
async def inlet(
|
async def inlet(
|
||||||
self,
|
self,
|
||||||
body: dict,
|
body: dict,
|
||||||
__user__: Optional[dict] = None,
|
__user__: Optional[dict] = None,
|
||||||
__metadata__: dict = None,
|
__metadata__: dict = None,
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
在发送到 LLM 之前执行
|
在发送到 LLM 之前执行
|
||||||
@@ -516,10 +618,11 @@ class Filter:
|
|||||||
messages = body.get("messages", [])
|
messages = body.get("messages", [])
|
||||||
chat_id = __metadata__["chat_id"]
|
chat_id = __metadata__["chat_id"]
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
if self.valves.debug_mode or self.valves.show_debug_log:
|
||||||
print(f"\n{'='*60}")
|
await self._log(
|
||||||
print(f"[Inlet] Chat ID: {chat_id}")
|
f"\n{'='*60}\n[Inlet] Chat ID: {chat_id}\n[Inlet] 收到 {len(messages)} 条消息",
|
||||||
print(f"[Inlet] 收到 {len(messages)} 条消息")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 记录原始消息的目标压缩进度,供 outlet 使用
|
# 记录原始消息的目标压缩进度,供 outlet 使用
|
||||||
# 目标是压缩到倒数第 keep_last 条之前
|
# 目标是压缩到倒数第 keep_last 条之前
|
||||||
@@ -527,13 +630,18 @@ class Filter:
|
|||||||
|
|
||||||
# [优化] 简单的状态清理检查
|
# [优化] 简单的状态清理检查
|
||||||
if chat_id in self.temp_state:
|
if chat_id in self.temp_state:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[Inlet] ⚠️ 覆盖未消费的旧状态 (Chat ID: {chat_id})")
|
f"[Inlet] ⚠️ 覆盖未消费的旧状态 (Chat ID: {chat_id})",
|
||||||
|
type="warning",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
self.temp_state[chat_id] = target_compressed_count
|
self.temp_state[chat_id] = target_compressed_count
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[Inlet] 记录目标压缩进度: {target_compressed_count}")
|
f"[Inlet] 记录目标压缩进度: {target_compressed_count}",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 加载摘要记录
|
# 加载摘要记录
|
||||||
summary_record = await asyncio.to_thread(self._load_summary_record, chat_id)
|
summary_record = await asyncio.to_thread(self._load_summary_record, chat_id)
|
||||||
@@ -582,19 +690,32 @@ class Filter:
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[Inlet] 应用摘要: Head({len(head_messages)}) + Summary + Tail({len(tail_messages)})",
|
||||||
f"[Inlet] 应用摘要: Head({len(head_messages)}) + Summary + Tail({len(tail_messages)})"
|
type="success",
|
||||||
)
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Emit debug log to frontend (Keep the structured log as well)
|
||||||
|
await self._emit_debug_log(
|
||||||
|
__event_call__,
|
||||||
|
chat_id,
|
||||||
|
len(messages),
|
||||||
|
len(final_messages),
|
||||||
|
len(summary_record.summary),
|
||||||
|
self.valves.keep_first,
|
||||||
|
self.valves.keep_last,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# 没有摘要,使用原始消息
|
# 没有摘要,使用原始消息
|
||||||
final_messages = messages
|
final_messages = messages
|
||||||
|
|
||||||
body["messages"] = final_messages
|
body["messages"] = final_messages
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[Inlet] 最终发送: {len(body['messages'])} 条消息")
|
f"[Inlet] 最终发送: {len(body['messages'])} 条消息\n{'='*60}\n",
|
||||||
print(f"{'='*60}\n")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
return body
|
return body
|
||||||
|
|
||||||
@@ -604,6 +725,7 @@ class Filter:
|
|||||||
__user__: Optional[dict] = None,
|
__user__: Optional[dict] = None,
|
||||||
__metadata__: dict = None,
|
__metadata__: dict = None,
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
在 LLM 响应完成后执行
|
在 LLM 响应完成后执行
|
||||||
@@ -612,21 +734,23 @@ class Filter:
|
|||||||
chat_id = __metadata__["chat_id"]
|
chat_id = __metadata__["chat_id"]
|
||||||
model = body.get("model", "gpt-3.5-turbo")
|
model = body.get("model", "gpt-3.5-turbo")
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
if self.valves.debug_mode or self.valves.show_debug_log:
|
||||||
print(f"\n{'='*60}")
|
await self._log(
|
||||||
print(f"[Outlet] Chat ID: {chat_id}")
|
f"\n{'='*60}\n[Outlet] Chat ID: {chat_id}\n[Outlet] 响应完成",
|
||||||
print(f"[Outlet] 响应完成")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 在后台异步处理 Token 计算和摘要生成(不等待完成,不影响输出)
|
# 在后台异步处理 Token 计算和摘要生成(不等待完成,不影响输出)
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
self._check_and_generate_summary_async(
|
self._check_and_generate_summary_async(
|
||||||
chat_id, model, body, __user__, __event_emitter__
|
chat_id, model, body, __user__, __event_emitter__, __event_call__
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[Outlet] 后台处理已启动")
|
f"[Outlet] 后台处理已启动\n{'='*60}\n",
|
||||||
print(f"{'='*60}\n")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
return body
|
return body
|
||||||
|
|
||||||
@@ -637,6 +761,7 @@ class Filter:
|
|||||||
body: dict,
|
body: dict,
|
||||||
user_data: Optional[dict],
|
user_data: Optional[dict],
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
后台处理:计算 Token 数并生成摘要(不阻塞响应)
|
后台处理:计算 Token 数并生成摘要(不阻塞响应)
|
||||||
@@ -650,36 +775,50 @@ class Filter:
|
|||||||
"compression_threshold_tokens", self.valves.compression_threshold_tokens
|
"compression_threshold_tokens", self.valves.compression_threshold_tokens
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"\n[🔍 后台计算] 开始 Token 计数...")
|
f"\n[🔍 后台计算] 开始 Token 计数...",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 在后台线程中计算 Token 数
|
# 在后台线程中计算 Token 数
|
||||||
current_tokens = await asyncio.to_thread(
|
current_tokens = await asyncio.to_thread(
|
||||||
self._calculate_messages_tokens, messages
|
self._calculate_messages_tokens, messages
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🔍 后台计算] Token 数: {current_tokens}")
|
f"[🔍 后台计算] Token 数: {current_tokens}",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 检查是否需要压缩
|
# 检查是否需要压缩
|
||||||
if current_tokens >= compression_threshold_tokens:
|
if current_tokens >= compression_threshold_tokens:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🔍 后台计算] ⚡ 触发压缩阈值 (Token: {current_tokens} >= {compression_threshold_tokens})",
|
||||||
f"[🔍 后台计算] ⚡ 触发压缩阈值 (Token: {current_tokens} >= {compression_threshold_tokens})"
|
type="warning",
|
||||||
)
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 继续生成摘要
|
# 继续生成摘要
|
||||||
await self._generate_summary_async(
|
await self._generate_summary_async(
|
||||||
messages, chat_id, body, user_data, __event_emitter__
|
messages,
|
||||||
|
chat_id,
|
||||||
|
body,
|
||||||
|
user_data,
|
||||||
|
__event_emitter__,
|
||||||
|
__event_call__,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🔍 后台计算] 未触发压缩阈值 (Token: {current_tokens} < {compression_threshold_tokens})",
|
||||||
f"[🔍 后台计算] 未触发压缩阈值 (Token: {current_tokens} < {compression_threshold_tokens})"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[🔍 后台计算] ❌ 错误: {str(e)}")
|
await self._log(
|
||||||
|
f"[🔍 后台计算] ❌ 错误: {str(e)}",
|
||||||
|
type="error",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
async def _generate_summary_async(
|
async def _generate_summary_async(
|
||||||
self,
|
self,
|
||||||
@@ -688,6 +827,7 @@ class Filter:
|
|||||||
body: dict,
|
body: dict,
|
||||||
user_data: Optional[dict],
|
user_data: Optional[dict],
|
||||||
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
__event_emitter__: Callable[[Any], Awaitable[None]] = None,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
异步生成摘要(后台执行,不阻塞响应)
|
异步生成摘要(后台执行,不阻塞响应)
|
||||||
@@ -697,18 +837,18 @@ class Filter:
|
|||||||
3. 对剩余的中间消息生成摘要。
|
3. 对剩余的中间消息生成摘要。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
if self.valves.debug_mode:
|
await self._log(f"\n[🤖 异步摘要任务] 开始...", event_call=__event_call__)
|
||||||
print(f"\n[🤖 异步摘要任务] 开始...")
|
|
||||||
|
|
||||||
# 1. 获取目标压缩进度
|
# 1. 获取目标压缩进度
|
||||||
# 优先从 temp_state 获取(由 inlet 计算),如果获取不到(例如重启后),则假设当前是完整历史
|
# 优先从 temp_state 获取(由 inlet 计算),如果获取不到(例如重启后),则假设当前是完整历史
|
||||||
target_compressed_count = self.temp_state.pop(chat_id, None)
|
target_compressed_count = self.temp_state.pop(chat_id, None)
|
||||||
if target_compressed_count is None:
|
if target_compressed_count is None:
|
||||||
target_compressed_count = max(0, len(messages) - self.valves.keep_last)
|
target_compressed_count = max(0, len(messages) - self.valves.keep_last)
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 异步摘要任务] ⚠️ 无法获取 inlet 状态,使用当前消息数估算进度: {target_compressed_count}",
|
||||||
f"[🤖 异步摘要任务] ⚠️ 无法获取 inlet 状态,使用当前消息数估算进度: {target_compressed_count}"
|
type="warning",
|
||||||
)
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 2. 确定待压缩的消息范围 (Middle)
|
# 2. 确定待压缩的消息范围 (Middle)
|
||||||
start_index = self.valves.keep_first
|
start_index = self.valves.keep_first
|
||||||
@@ -718,16 +858,18 @@ class Filter:
|
|||||||
|
|
||||||
# 确保索引有效
|
# 确保索引有效
|
||||||
if start_index >= end_index:
|
if start_index >= end_index:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 异步摘要任务] 中间消息为空 (Start: {start_index}, End: {end_index}),跳过",
|
||||||
f"[🤖 异步摘要任务] 中间消息为空 (Start: {start_index}, End: {end_index}),跳过"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
middle_messages = messages[start_index:end_index]
|
middle_messages = messages[start_index:end_index]
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 异步摘要任务] 待处理中间消息: {len(middle_messages)} 条")
|
f"[🤖 异步摘要任务] 待处理中间消息: {len(middle_messages)} 条",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 3. 检查 Token 上限并截断 (Max Context Truncation)
|
# 3. 检查 Token 上限并截断 (Max Context Truncation)
|
||||||
# [优化] 使用摘要模型(如果有)的阈值来决定能处理多少中间消息
|
# [优化] 使用摘要模型(如果有)的阈值来决定能处理多少中间消息
|
||||||
@@ -740,22 +882,26 @@ class Filter:
|
|||||||
"max_context_tokens", self.valves.max_context_tokens
|
"max_context_tokens", self.valves.max_context_tokens
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 异步摘要任务] 使用模型 {summary_model_id} 的上限: {max_context_tokens} Tokens",
|
||||||
f"[🤖 异步摘要任务] 使用模型 {summary_model_id} 的上限: {max_context_tokens} Tokens"
|
event_call=__event_call__,
|
||||||
)
|
|
||||||
|
|
||||||
# 计算当前总 Token (使用摘要模型进行计数)
|
|
||||||
total_tokens = await asyncio.to_thread(
|
|
||||||
self._calculate_messages_tokens, messages
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if total_tokens > max_context_tokens:
|
# 计算中间消息的 Token (加上提示词的缓冲)
|
||||||
excess_tokens = total_tokens - max_context_tokens
|
# 我们只把 middle_messages 发送给摘要模型,所以不应该把完整历史计入限制
|
||||||
if self.valves.debug_mode:
|
middle_tokens = await asyncio.to_thread(
|
||||||
print(
|
self._calculate_messages_tokens, middle_messages
|
||||||
f"[🤖 异步摘要任务] ⚠️ 总 Token ({total_tokens}) 超过摘要模型上限 ({max_context_tokens}),需要移除约 {excess_tokens} Token"
|
)
|
||||||
)
|
# 增加提示词和输出的缓冲 (约 2000 Tokens)
|
||||||
|
estimated_input_tokens = middle_tokens + 2000
|
||||||
|
|
||||||
|
if estimated_input_tokens > max_context_tokens:
|
||||||
|
excess_tokens = estimated_input_tokens - max_context_tokens
|
||||||
|
await self._log(
|
||||||
|
f"[🤖 异步摘要任务] ⚠️ 中间消息 ({middle_tokens} Tokens) + 缓冲超过摘要模型上限 ({max_context_tokens}),需要移除约 {excess_tokens} Token",
|
||||||
|
type="warning",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 从 middle_messages 头部开始移除
|
# 从 middle_messages 头部开始移除
|
||||||
removed_tokens = 0
|
removed_tokens = 0
|
||||||
@@ -769,14 +915,16 @@ class Filter:
|
|||||||
removed_tokens += msg_tokens
|
removed_tokens += msg_tokens
|
||||||
removed_count += 1
|
removed_count += 1
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(
|
f"[🤖 异步摘要任务] 已移除 {removed_count} 条消息,共 {removed_tokens} Token",
|
||||||
f"[🤖 异步摘要任务] 已移除 {removed_count} 条消息,共 {removed_tokens} Token"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not middle_messages:
|
if not middle_messages:
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 异步摘要任务] 截断后中间消息为空,跳过摘要生成")
|
f"[🤖 异步摘要任务] 截断后中间消息为空,跳过摘要生成",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# 4. 构建对话文本
|
# 4. 构建对话文本
|
||||||
@@ -798,12 +946,14 @@ class Filter:
|
|||||||
)
|
)
|
||||||
|
|
||||||
new_summary = await self._call_summary_llm(
|
new_summary = await self._call_summary_llm(
|
||||||
None, conversation_text, body, user_data
|
None, conversation_text, body, user_data, __event_call__
|
||||||
)
|
)
|
||||||
|
|
||||||
# 6. 保存新摘要
|
# 6. 保存新摘要
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print("[优化] 正在后台线程中保存摘要,以避免阻塞事件循环。")
|
"[优化] 在后台线程中保存摘要以避免阻塞事件循环。",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
await asyncio.to_thread(
|
await asyncio.to_thread(
|
||||||
self._save_summary, chat_id, new_summary, target_compressed_count
|
self._save_summary, chat_id, new_summary, target_compressed_count
|
||||||
@@ -815,32 +965,40 @@ class Filter:
|
|||||||
{
|
{
|
||||||
"type": "status",
|
"type": "status",
|
||||||
"data": {
|
"data": {
|
||||||
"description": f"上下文摘要已更新 (已压缩 {len(middle_messages)} 条消息)",
|
"description": f"上下文摘要已更新 (压缩了 {len(middle_messages)} 条消息)",
|
||||||
"done": True,
|
"done": True,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 异步摘要任务] ✅ 完成!新摘要长度: {len(new_summary)} 字符")
|
f"[🤖 异步摘要任务] ✅ 完成!新摘要长度: {len(new_summary)} 字符",
|
||||||
print(
|
type="success",
|
||||||
f"[🤖 异步摘要任务] 进度更新: 已压缩至原始第 {target_compressed_count} 条消息"
|
event_call=__event_call__,
|
||||||
)
|
)
|
||||||
|
await self._log(
|
||||||
|
f"[🤖 异步摘要任务] 进度更新: 已压缩至原始消息 {target_compressed_count}",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[🤖 异步摘要任务] ❌ 错误: {str(e)}")
|
await self._log(
|
||||||
|
f"[🤖 异步摘要任务] ❌ 错误: {str(e)}",
|
||||||
|
type="error",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
def _format_messages_for_summary(self, messages: list) -> str:
|
def _format_messages_for_summary(self, messages: list) -> str:
|
||||||
"""格式化消息用于摘要"""
|
"""Formats messages for summarization."""
|
||||||
formatted = []
|
formatted = []
|
||||||
for i, msg in enumerate(messages, 1):
|
for i, msg in enumerate(messages, 1):
|
||||||
role = msg.get("role", "unknown")
|
role = msg.get("role", "unknown")
|
||||||
content = msg.get("content", "")
|
content = msg.get("content", "")
|
||||||
|
|
||||||
# 处理多模态内容
|
# Handle multimodal content
|
||||||
if isinstance(content, list):
|
if isinstance(content, list):
|
||||||
text_parts = []
|
text_parts = []
|
||||||
for part in content:
|
for part in content:
|
||||||
@@ -848,10 +1006,10 @@ class Filter:
|
|||||||
text_parts.append(part.get("text", ""))
|
text_parts.append(part.get("text", ""))
|
||||||
content = " ".join(text_parts)
|
content = " ".join(text_parts)
|
||||||
|
|
||||||
# 处理角色名称
|
# Handle role name
|
||||||
role_name = {"user": "用户", "assistant": "助手"}.get(role, role)
|
role_name = {"user": "User", "assistant": "Assistant"}.get(role, role)
|
||||||
|
|
||||||
# 限制每条消息的长度,避免过长
|
# Limit length of each message to avoid excessive length
|
||||||
if len(content) > 500:
|
if len(content) > 500:
|
||||||
content = content[:500] + "..."
|
content = content[:500] + "..."
|
||||||
|
|
||||||
@@ -865,12 +1023,15 @@ class Filter:
|
|||||||
new_conversation_text: str,
|
new_conversation_text: str,
|
||||||
body: dict,
|
body: dict,
|
||||||
user_data: dict,
|
user_data: dict,
|
||||||
|
__event_call__: Callable[[Any], Awaitable[None]] = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
使用 Open WebUI 内置方法调用 LLM 生成摘要
|
调用 LLM 生成摘要,使用 Open Web UI 的内置方法。
|
||||||
"""
|
"""
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 LLM 调用] 使用 Open WebUI 内置方法")
|
f"[🤖 LLM 调用] 使用 Open Web UI 内置方法",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 构建摘要提示词 (优化版)
|
# 构建摘要提示词 (优化版)
|
||||||
summary_prompt = f"""
|
summary_prompt = f"""
|
||||||
@@ -909,8 +1070,7 @@ class Filter:
|
|||||||
# 确定使用的模型
|
# 确定使用的模型
|
||||||
model = self.valves.summary_model or body.get("model", "")
|
model = self.valves.summary_model or body.get("model", "")
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(f"[🤖 LLM 调用] 模型: {model}", event_call=__event_call__)
|
||||||
print(f"[🤖 LLM 调用] 模型: {model}")
|
|
||||||
|
|
||||||
# 构建 payload
|
# 构建 payload
|
||||||
payload = {
|
payload = {
|
||||||
@@ -927,17 +1087,20 @@ class Filter:
|
|||||||
if not user_id:
|
if not user_id:
|
||||||
raise ValueError("无法获取用户 ID")
|
raise ValueError("无法获取用户 ID")
|
||||||
|
|
||||||
# [优化] 在后台线程中获取用户对象,以避免阻塞事件循环
|
# [优化] 在后台线程中获取用户对象以避免阻塞事件循环
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print("[优化] 正在后台线程中获取用户对象,以避免阻塞事件循环。")
|
"[优化] 在后台线程中获取用户对象以避免阻塞事件循环。",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
user = await asyncio.to_thread(Users.get_user_by_id, user_id)
|
user = await asyncio.to_thread(Users.get_user_by_id, user_id)
|
||||||
|
|
||||||
if not user:
|
if not user:
|
||||||
raise ValueError(f"无法找到用户: {user_id}")
|
raise ValueError(f"无法找到用户: {user_id}")
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 LLM 调用] 用户: {user.email}")
|
f"[🤖 LLM 调用] 用户: {user.email}\n[🤖 LLM 调用] 发送请求...",
|
||||||
print(f"[🤖 LLM 调用] 发送请求...")
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
# 创建 Request 对象
|
# 创建 Request 对象
|
||||||
request = Request(scope={"type": "http", "app": webui_app})
|
request = Request(scope={"type": "http", "app": webui_app})
|
||||||
@@ -950,8 +1113,11 @@ class Filter:
|
|||||||
|
|
||||||
summary = response["choices"][0]["message"]["content"].strip()
|
summary = response["choices"][0]["message"]["content"].strip()
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 LLM 调用] ✅ 成功获取摘要")
|
f"[🤖 LLM 调用] ✅ 成功接收摘要",
|
||||||
|
type="success",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
return summary
|
return summary
|
||||||
|
|
||||||
@@ -959,11 +1125,14 @@ class Filter:
|
|||||||
error_message = f"调用 LLM ({model}) 生成摘要时发生错误: {str(e)}"
|
error_message = f"调用 LLM ({model}) 生成摘要时发生错误: {str(e)}"
|
||||||
if not self.valves.summary_model:
|
if not self.valves.summary_model:
|
||||||
error_message += (
|
error_message += (
|
||||||
"\n[提示] 您没有指定摘要模型 (summary_model),因此尝试使用当前对话的模型。"
|
"\n[提示] 您未指定 summary_model,因此过滤器尝试使用当前对话的模型。"
|
||||||
"如果这是一个流水线(Pipe)模型或不兼容的模型,请在配置中指定一个兼容的摘要模型(如 'gemini-2.5-flash')。"
|
"如果这是流水线 (Pipe) 模型或不兼容的模型,请在配置中指定兼容的摘要模型 (例如 'gemini-2.5-flash')。"
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.valves.debug_mode:
|
await self._log(
|
||||||
print(f"[🤖 LLM 调用] ❌ {error_message}")
|
f"[🤖 LLM 调用] ❌ {error_message}",
|
||||||
|
type="error",
|
||||||
|
event_call=__event_call__,
|
||||||
|
)
|
||||||
|
|
||||||
raise Exception(error_message)
|
raise Exception(error_message)
|
||||||
|
|||||||
Reference in New Issue
Block a user