fix: harden async compression compatibility

Co-authored-by: Fu-Jie <33599649+Fu-Jie@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2026-01-11 08:24:56 +00:00
parent f479f23b38
commit d2f35ce396

View File

@@ -257,9 +257,7 @@ from fastapi.requests import Request
from open_webui.main import app as webui_app from open_webui.main import app as webui_app
# Open WebUI internal database (re-use shared connection) # Open WebUI internal database (re-use shared connection)
from open_webui.internal.db import engine as owui_engine import open_webui.internal.db as owui_db
from open_webui.internal.db import Session as owui_Session
from open_webui.internal.db import Base as owui_Base
# Try to import tiktoken # Try to import tiktoken
try: try:
@@ -272,6 +270,9 @@ from sqlalchemy import Column, String, Text, DateTime, Integer, inspect
from datetime import datetime from datetime import datetime
owui_Base = owui_db.Base
class ChatSummary(owui_Base): class ChatSummary(owui_Base):
"""Chat Summary Storage Table""" """Chat Summary Storage Table"""
@@ -289,8 +290,14 @@ class ChatSummary(owui_Base):
class Filter: class Filter:
def __init__(self): def __init__(self):
self.valves = self.Valves() self.valves = self.Valves()
self._db_engine = owui_engine self._db_engine = owui_db.engine
self._SessionLocal = owui_Session self._SessionLocal = (
getattr(owui_db, "ScopedSession", None)
or getattr(owui_db, "SessionLocal", None)
or getattr(owui_db, "Session", None)
)
if self._SessionLocal is None:
raise RuntimeError("Open WebUI database session factory unavailable.")
self.temp_state = {} # Used to pass temporary data between inlet and outlet self.temp_state = {} # Used to pass temporary data between inlet and outlet
self._init_database() self._init_database()
@@ -632,7 +639,15 @@ class Filter:
Compression Strategy: Only responsible for injecting existing summaries, no Token calculation. Compression Strategy: Only responsible for injecting existing summaries, no Token calculation.
""" """
messages = body.get("messages", []) messages = body.get("messages", [])
chat_id = __metadata__["chat_id"] chat_id = (__metadata__ or {}).get("chat_id")
if not chat_id:
await self._log(
"[Inlet] ❌ Missing chat_id in metadata, skipping compression",
type="error",
event_call=__event_call__,
)
return body
if self.valves.debug_mode or self.valves.show_debug_log: if self.valves.debug_mode or self.valves.show_debug_log:
await self._log( await self._log(
@@ -747,7 +762,14 @@ class Filter:
Executed after the LLM response is complete. Executed after the LLM response is complete.
Calculates Token count in the background and triggers summary generation (does not block current response, does not affect content output). Calculates Token count in the background and triggers summary generation (does not block current response, does not affect content output).
""" """
chat_id = __metadata__["chat_id"] chat_id = (__metadata__ or {}).get("chat_id")
if not chat_id:
await self._log(
"[Outlet] ❌ Missing chat_id in metadata, skipping compression",
type="error",
event_call=__event_call__,
)
return body
model = body.get("model", "gpt-3.5-turbo") model = body.get("model", "gpt-3.5-turbo")
if self.valves.debug_mode or self.valves.show_debug_log: if self.valves.debug_mode or self.valves.show_debug_log:
@@ -892,10 +914,20 @@ class Filter:
# 3. Check Token limit and truncate (Max Context Truncation) # 3. Check Token limit and truncate (Max Context Truncation)
# [Optimization] Use the summary model's (if any) threshold to decide how many middle messages can be processed # [Optimization] Use the summary model's (if any) threshold to decide how many middle messages can be processed
# This allows using a long-window model (like gemini-flash) to compress history exceeding the current model's window # This allows using a long-window model (like gemini-flash) to compress history exceeding the current model's window
summary_model_id = self.valves.summary_model or body.get( summary_model_id = (
"model", "gpt-3.5-turbo" self.valves.summary_model
or body.get("model")
or "gpt-3.5-turbo"
) )
if not summary_model_id:
await self._log(
"[🤖 Async Summary Task] ⚠️ Summary model is empty, skipping compression",
type="warning",
event_call=__event_call__,
)
return
thresholds = self._get_model_thresholds(summary_model_id) thresholds = self._get_model_thresholds(summary_model_id)
# Note: Using the summary model's max context limit here # Note: Using the summary model's max context limit here
max_context_tokens = thresholds.get( max_context_tokens = thresholds.get(
@@ -963,12 +995,24 @@ class Filter:
"done": False, "done": False,
}, },
} }
) )
new_summary = await self._call_summary_llm( new_summary = await self._call_summary_llm(
None, conversation_text, body, user_data, __event_call__ None,
conversation_text,
{**body, "model": summary_model_id},
user_data,
__event_call__,
) )
if not new_summary:
await self._log(
"[🤖 Async Summary Task] ⚠️ Summary generation returned empty result, skipping save",
type="warning",
event_call=__event_call__,
)
return
# 6. Save new summary # 6. Save new summary
await self._log( await self._log(
"[Optimization] Saving summary in a background thread to avoid blocking the event loop.", "[Optimization] Saving summary in a background thread to avoid blocking the event loop.",
@@ -1090,6 +1134,14 @@ Based on the content above, generate the summary:
# Determine the model to use # Determine the model to use
model = self.valves.summary_model or body.get("model", "") model = self.valves.summary_model or body.get("model", "")
if not model:
await self._log(
"[🤖 LLM Call] ⚠️ Model ID is empty, skipping summary generation",
type="warning",
event_call=__event_call__,
)
return ""
await self._log(f"[🤖 LLM Call] Model: {model}", event_call=__event_call__) await self._log(f"[🤖 LLM Call] Model: {model}", event_call=__event_call__)
# Build payload # Build payload