From adc5e0a1f4bb2749bdd5ee06ff4dbfef7a42cba6 Mon Sep 17 00:00:00 2001 From: fujie Date: Sat, 21 Feb 2026 23:44:12 +0800 Subject: [PATCH] feat(filters): release v1.3.0 for async context compression - Add native i18n support across 9 languages - Implement non-blocking frontend log emission for zero TTFB delay - Add token_usage_status_threshold to intelligently control status notifications - Automatically detect and skip compression for copilot_sdk models - Set debug_mode default to false for a quieter production environment - Update documentation and remove legacy bilingual code --- .../filters/async-context-compression.md | 174 +- .../filters/async-context-compression.zh.md | 206 +- docs/plugins/filters/index.md | 2 +- docs/plugins/filters/index.zh.md | 2 +- .../async-context-compression/README.md | 16 +- .../async-context-compression/README_CN.md | 15 +- .../async_context_compression.py | 737 ++++-- .../async_context_compression_cn.py | 2028 ----------------- 8 files changed, 771 insertions(+), 2409 deletions(-) delete mode 100644 plugins/filters/async-context-compression/async_context_compression_cn.py diff --git a/docs/plugins/filters/async-context-compression.md b/docs/plugins/filters/async-context-compression.md index af88ad4..fe94989 100644 --- a/docs/plugins/filters/async-context-compression.md +++ b/docs/plugins/filters/async-context-compression.md @@ -1,137 +1,81 @@ -# Async Context Compression +# Async Context Compression Filter -Filter -v1.2.2 +**Author:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **Version:** 1.3.0 | **Project:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **License:** MIT -Reduces token consumption in long conversations through intelligent summarization while maintaining conversational coherence. +This filter reduces token consumption in long conversations through intelligent summarization and message compression while keeping conversations coherent. 
+ +## What's new in 1.3.0 + +- **Internationalization (i18n)**: Complete localization of user-facing messages across 9 languages (English, Chinese, Japanese, Korean, French, German, Spanish, Italian). +- **Smart Status Display**: Added `token_usage_status_threshold` valve (default 80%) to intelligently control when token usage status is shown. +- **Improved Performance**: Frontend language detection and logging are optimized to be completely non-blocking, maintaining lightning-fast TTFB. +- **Copilot SDK Integration**: Automatically detects and skips compression for copilot_sdk based models to prevent conflicts. +- **Configuration**: `debug_mode` is now set to `false` by default for a quieter production experience. --- -## Overview +## Core Features -The Async Context Compression filter helps manage token usage in long conversations by: - -- Intelligently summarizing older messages -- Preserving important context -- Reducing API costs -- Maintaining conversation coherence - -This is especially useful for: - -- Long-running conversations -- Complex multi-turn discussions -- Cost optimization -- Token limit management - -## Features - -- :material-arrow-collapse-vertical: **Smart Compression**: AI-powered context summarization -- :material-clock-fast: **Async Processing**: Non-blocking background compression -- :material-memory: **Context Preservation**: Keeps important information -- :material-currency-usd-off: **Cost Reduction**: Minimize token usage -- :material-console: **Frontend Debugging**: Debug logs in browser console -- :material-alert-circle-check: **Enhanced Error Reporting**: Clear error status notifications -- :material-check-all: **Open WebUI v0.7.x Compatibility**: Dynamic DB session handling -- :material-account-convert: **Improved Compatibility**: Summary role changed to `assistant` -- :material-shield-check: **Enhanced Stability**: Resolved race conditions in state management -- :material-ruler: **Preflight Context Check**: Validates context fit 
before sending -- :material-format-align-justify: **Structure-Aware Trimming**: Preserves document structure -- :material-content-cut: **Native Tool Output Trimming**: Trims verbose tool outputs (Note: Non-native tool outputs are not fully injected into context) -- :material-chart-bar: **Detailed Token Logging**: Granular token breakdown -- :material-account-search: **Smart Model Matching**: Inherit config from base models -- :material-image-off: **Multimodal Support**: Images are preserved but tokens are **NOT** calculated +- ✅ **Full i18n Support**: Native localization across 9 languages. +- ✅ Automatic compression triggered by token thresholds. +- ✅ Asynchronous summarization that does not block chat responses. +- ✅ Persistent storage via Open WebUI's shared database connection (PostgreSQL, SQLite, etc.). +- ✅ Flexible retention policy to keep the first and last N messages. +- ✅ Smart injection of historical summaries back into the context. +- ✅ Structure-aware trimming that preserves document structure (headers, intro, conclusion). +- ✅ Native tool output trimming for cleaner context when using function calling. +- ✅ Real-time context usage monitoring with warning notifications (>90%). +- ✅ Detailed token logging for precise debugging and optimization. +- ✅ **Smart Model Matching**: Automatically inherits configuration from base models for custom presets. +- ⚠ **Multimodal Support**: Images are preserved but their tokens are **NOT** calculated. Please adjust thresholds accordingly. --- -## Installation +## Installation & Configuration -1. Download the plugin file: [`async_context_compression.py`](https://github.com/Fu-Jie/openwebui-extensions/tree/main/plugins/filters/async-context-compression) -2. Upload to OpenWebUI: **Admin Panel** → **Settings** → **Functions** -3. Configure compression settings -4. Enable the filter +### 1) Database (automatic) + +- Uses Open WebUI's shared database connection; no extra configuration needed. 
+- The `chat_summary` table is created on first run. + +### 2) Filter order + +- Recommended order: pre-filters (<10) → this filter (10) → post-filters (>10). --- -## How It Works +## Configuration Parameters -```mermaid -graph TD - A[Incoming Messages] --> B{Token Count > Threshold?} - B -->|No| C[Pass Through] - B -->|Yes| D[Summarize Older Messages] - D --> E[Preserve Recent Messages] - E --> F[Combine Summary + Recent] - F --> G[Send to LLM] -``` +| Parameter | Default | Description | +| :----------------------------- | :------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `priority` | `10` | Execution order; lower runs earlier. | +| `compression_threshold_tokens` | `64000` | Trigger asynchronous summary when total tokens exceed this value. Set to 50%-70% of your model's context window. | +| `max_context_tokens` | `128000` | Hard cap for context; older messages (except protected ones) are dropped if exceeded. | +| `keep_first` | `1` | Always keep the first N messages (protects system prompts). | +| `keep_last` | `6` | Always keep the last N messages to preserve recent context. | +| `summary_model` | `None` | Model for summaries. Strongly recommended to set a fast, economical model (e.g., `gemini-2.5-flash`, `deepseek-v3`). Falls back to the current chat model when empty. | +| `summary_model_max_context` | `0` | Max context tokens for the summary model. If 0, falls back to `model_thresholds` or global `max_context_tokens`. | +| `max_summary_tokens` | `16384` | Maximum tokens for the generated summary. | +| `summary_temperature` | `0.3` | Randomness for summary generation. Lower is more deterministic. | +| `model_thresholds` | `{}` | Per-model overrides for `compression_threshold_tokens` and `max_context_tokens` (useful for mixed models). 
| +| `enable_tool_output_trimming` | `false` | When enabled and `function_calling: "native"` is active, trims verbose tool outputs to extract only the final answer. | +| `debug_mode` | `false` | Log verbose debug info. Set to `false` in production. | +| `show_debug_log` | `false` | Print debug logs to browser console (F12). Useful for frontend debugging. | +| `show_token_usage_status` | `true` | Show token usage status notification in the chat interface. | +| `token_usage_status_threshold` | `80` | The minimum usage percentage (0-100) required to show a context usage status notification. | --- -## Configuration +## ⭐ Support -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `compression_threshold_tokens` | integer | `64000` | Trigger compression above this token count | -| `max_context_tokens` | integer | `128000` | Hard limit for context | -| `keep_first` | integer | `1` | Always keep the first N messages | -| `keep_last` | integer | `6` | Always keep the last N messages | -| `summary_model` | string | `None` | Model to use for summarization | -| `summary_model_max_context` | integer | `0` | Max context tokens for summary model | -| `max_summary_tokens` | integer | `16384` | Maximum tokens for the summary | -| `enable_tool_output_trimming` | boolean | `false` | Enable trimming of large tool outputs | +If this plugin has been useful, a star on [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) is a big motivation for me. Thank you for the support. ---- +## Troubleshooting ❓ -## Example +- **Initial system prompt is lost**: Keep `keep_first` greater than 0 to protect the initial message. +- **Compression effect is weak**: Raise `compression_threshold_tokens` or lower `keep_first` / `keep_last` to allow more aggressive compression. 
+- **Submit an Issue**: If you encounter any problems, please submit an issue on GitHub: [OpenWebUI Extensions Issues](https://github.com/Fu-Jie/openwebui-extensions/issues) -### Before Compression +## Changelog -``` -[Message 1] User: Tell me about Python... -[Message 2] AI: Python is a programming language... -[Message 3] User: What about its history? -[Message 4] AI: Python was created by Guido... -[Message 5] User: And its features? -[Message 6] AI: Python has many features... -... (many more messages) -[Message 20] User: Current question -``` - -### After Compression - -``` -[Summary] Previous conversation covered Python basics, -history, features, and common use cases... - -[Message 18] User: Recent question about decorators -[Message 19] AI: Decorators in Python are... -[Message 20] User: Current question -``` - ---- - -## Requirements - -!!! note "Prerequisites" - - OpenWebUI v0.3.0 or later - - Access to an LLM for summarization - -!!! tip "Best Practices" - - Set appropriate token thresholds based on your model's context window - - Preserve more recent messages for technical discussions - - Test compression settings in non-critical conversations first - ---- - -## Troubleshooting - -??? question "Compression not triggering?" - Check if the token count exceeds your configured threshold. Enable debug logging for more details. - -??? question "Important context being lost?" - Increase the `preserve_recent` setting or lower the compression ratio. 
- ---- - -## Source Code - -[:fontawesome-brands-github: View on GitHub](https://github.com/Fu-Jie/openwebui-extensions/tree/main/plugins/filters/async-context-compression){ .md-button } +See the full history on GitHub: [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) diff --git a/docs/plugins/filters/async-context-compression.zh.md b/docs/plugins/filters/async-context-compression.zh.md index 13a8fb7..797f19c 100644 --- a/docs/plugins/filters/async-context-compression.zh.md +++ b/docs/plugins/filters/async-context-compression.zh.md @@ -1,137 +1,119 @@ -# Async Context Compression(异步上下文压缩) +# 异步上下文压缩过滤器 -Filter -v1.2.2 +**作者:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **版本:** 1.3.0 | **项目:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **许可证:** MIT -通过智能摘要减少长对话的 token 消耗,同时保持对话连贯。 +> **重要提示**:为了确保所有过滤器的可维护性和易用性,每个过滤器都应附带清晰、完整的文档,以确保其功能、配置和使用方法得到充分说明。 + +本过滤器通过智能摘要和消息压缩技术,在保持对话连贯性的同时,显著降低长对话的 Token 消耗。 + +## 1.3.0 版本更新 + +- **国际化 (i18n) 支持**: 完成了所有用户可见消息的本地化,现已原生支持 9 种语言(含中、英、日、韩及欧洲主要语言)。 +- **智能状态显示**: 新增 `token_usage_status_threshold` 阀门(默认 80%),可以智能控制何时显示 Token 用量状态,减少不必要的打扰。 +- **性能大幅优化**: 对前端语言检测和日志处理流程进行了非阻塞重构,完全不影响首字节响应时间(TTFB),保持毫秒级极速推流。 +- **Copilot SDK 兼容**: 自动检测并跳过基于 `copilot_sdk` 模型的上下文压缩,避免冲突。 +- **配置项调整**: 为了提供更安静的生产环境体验,`debug_mode` 现已默认设置为 `false`。 --- -## 概览 +## 核心特性 -Async Context Compression 过滤器通过以下方式帮助管理长对话的 token 使用: +- ✅ **全方位国际化**: 原生支持 9 种界面语言。 +- ✅ **自动压缩**: 基于 Token 阈值自动触发上下文压缩。 +- ✅ **异步摘要**: 后台生成摘要,不阻塞当前对话响应。 +- ✅ **持久化存储**: 复用 Open WebUI 共享数据库连接,自动支持 PostgreSQL/SQLite 等。 +- ✅ **灵活保留策略**: 可配置保留对话头部和尾部消息,确保关键信息连贯。 +- ✅ **智能注入**: 将历史摘要智能注入到新上下文中。 +- ✅ **结构感知裁剪**: 智能折叠过长消息,保留文档骨架(标题、首尾)。 +- ✅ **原生工具输出裁剪**: 支持裁剪冗长的工具调用输出。 +- ✅ **实时监控**: 实时监控上下文使用情况,超过 90% 发出警告。 +- ✅ **详细日志**: 提供精确的 Token 统计日志,便于调试。 +- ✅ **智能模型匹配**: 自定义模型自动继承基础模型的阈值配置。 +- ⚠ **多模态支持**: 图片内容会被保留,但其 Token **不参与计算**。请相应调整阈值。 -- 智能总结较早的消息 -- 保留关键信息 -- 降低 API 成本 -- 保持对话一致性 - -特别适用于: - -- 长时间会话 -- 多轮复杂讨论 -- 成本优化 -- 
上下文长度控制 - -## 功能特性 - -- :material-arrow-collapse-vertical: **智能压缩**:AI 驱动的上下文摘要 -- :material-clock-fast: **异步处理**:后台非阻塞压缩 -- :material-memory: **保留上下文**:尽量保留重要信息 -- :material-currency-usd-off: **降低成本**:减少 token 使用 -- :material-console: **前端调试**:支持浏览器控制台日志 -- :material-alert-circle-check: **增强错误报告**:清晰的错误状态通知 -- :material-check-all: **Open WebUI v0.7.x 兼容性**:动态数据库会话处理 -- :material-account-convert: **兼容性提升**:摘要角色改为 `assistant` -- :material-shield-check: **稳定性增强**:解决状态管理竞态条件 -- :material-ruler: **预检上下文检查**:发送前验证上下文是否超限 -- :material-format-align-justify: **结构感知裁剪**:保留文档结构的智能裁剪 -- :material-content-cut: **原生工具输出裁剪**:自动裁剪冗长的工具输出(注意:非原生工具调用输出不会完整注入上下文) -- :material-chart-bar: **详细 Token 日志**:提供细粒度的 Token 统计 -- :material-account-search: **智能模型匹配**:自定义模型自动继承基础模型配置 -- :material-image-off: **多模态支持**:图片内容保留但 Token **不参与计算** +详细的工作原理和流程请参考 [工作流程指南](WORKFLOW_GUIDE_CN.md)。 --- -## 安装 +## 安装与配置 -1. 下载插件文件:[`async_context_compression.py`](https://github.com/Fu-Jie/openwebui-extensions/tree/main/plugins/filters/async-context-compression) -2. 上传到 OpenWebUI:**Admin Panel** → **Settings** → **Functions** -3. 配置压缩参数 -4. 启用过滤器 +### 1. 数据库(自动) + +- 自动使用 Open WebUI 的共享数据库连接,**无需额外配置**。 +- 首次运行自动创建 `chat_summary` 表。 + +### 2. 
过滤器顺序 + +- 建议顺序:前置过滤器(<10)→ 本过滤器(10)→ 后置过滤器(>10)。 --- -## 工作原理 +## 配置参数 -```mermaid -graph TD - A[Incoming Messages] --> B{Token Count > Threshold?} - B -->|No| C[Pass Through] - B -->|Yes| D[Summarize Older Messages] - D --> E[Preserve Recent Messages] - E --> F[Combine Summary + Recent] - F --> G[Send to LLM] +您可以在过滤器的设置中调整以下参数: + +### 核心参数 + +| 参数 | 默认值 | 描述 | +| :----------------------------- | :------- | :------------------------------------------------------------------------------------ | +| `priority` | `10` | 过滤器执行顺序,数值越小越先执行。 | +| `compression_threshold_tokens` | `64000` | **重要**: 当上下文总 Token 超过此值时后台生成摘要,建议设为模型上下文窗口的 50%-70%。 | +| `max_context_tokens` | `128000` | **重要**: 上下文硬上限,超过即移除最早消息(保留受保护消息)。 | +| `keep_first` | `1` | 始终保留对话开始的 N 条消息,保护系统提示或环境变量。 | +| `keep_last` | `6` | 始终保留对话末尾的 N 条消息,确保最近上下文连贯。 | + +### 摘要生成配置 + +| 参数 | 默认值 | 描述 | +| :-------------------- | :------ | :------------------------------------------------------------------------------------------------------------------------------------------ | +| `summary_model` | `None` | 用于生成摘要的模型 ID。**强烈建议**配置快速、经济、上下文窗口大的模型(如 `gemini-2.5-flash`、`deepseek-v3`)。留空则尝试复用当前对话模型。 | +| `summary_model_max_context` | `0` | 摘要模型的最大上下文 Token 数。如果为 0,则回退到 `model_thresholds` 或全局 `max_context_tokens`。 | +| `max_summary_tokens` | `16384` | 生成摘要时允许的最大 Token 数。 | +| `summary_temperature` | `0.1` | 控制摘要生成的随机性,较低的值结果更稳定。 | + +### 高级配置 + +#### `model_thresholds` (模型特定阈值) + +这是一个字典配置,可为特定模型 ID 覆盖全局 `compression_threshold_tokens` 与 `max_context_tokens`,适用于混合不同上下文窗口的模型。 + +**默认包含 GPT-4、Claude 3.5、Gemini 1.5/2.0、Qwen 2.5/3、DeepSeek V3 等推荐阈值。** + +**配置示例:** + +```json +{ + "gpt-4": { + "compression_threshold_tokens": 8000, + "max_context_tokens": 32000 + }, + "gemini-2.5-flash": { + "compression_threshold_tokens": 734000, + "max_context_tokens": 1048576 + } +} ``` ---- - -## 配置项 - -| 选项 | 类型 | 默认值 | 说明 | -|--------|------|---------|-------------| -| `compression_threshold_tokens` | integer | `64000` | 超过该 token 数触发压缩 | 
-| `max_context_tokens` | integer | `128000` | 上下文硬性上限 | -| `keep_first` | integer | `1` | 始终保留的前 N 条消息 | -| `keep_last` | integer | `6` | 始终保留的后 N 条消息 | -| `summary_model` | string | `None` | 用于摘要的模型 | -| `summary_model_max_context` | integer | `0` | 摘要模型的最大上下文 Token 数 | -| `max_summary_tokens` | integer | `16384` | 摘要的最大 token 数 | -| `enable_tool_output_trimming` | boolean | `false` | 启用长工具输出裁剪 | +| 参数 | 默认值 | 描述 | +| :----------------------------- | :------- | :-------------------------------------------------------------------------------------------------------------------------------------- | +| `enable_tool_output_trimming` | `false` | 启用时,若 `function_calling: "native"` 激活,将裁剪冗长的工具输出以仅提取最终答案。 | +| `debug_mode` | `false` | 是否在 Open WebUI 的控制台日志中打印详细的调试信息。生产环境默认且建议设为 `false`。 | +| `show_debug_log` | `false` | 是否在浏览器控制台 (F12) 打印调试日志。便于前端调试。 | +| `show_token_usage_status` | `true` | 是否在对话结束时显示 Token 使用情况的状态通知。 | +| `token_usage_status_threshold` | `80` | 触发显示上下文用量状态通知的最低百分比阈值 (0-100)。 | --- -## 示例 +## ⭐ 支持 -### 压缩前 +如果这个插件对你有帮助,欢迎到 [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) 点个 Star,这将是我持续改进的动力,感谢支持。 -``` -[Message 1] User: Tell me about Python... -[Message 2] AI: Python is a programming language... -[Message 3] User: What about its history? -[Message 4] AI: Python was created by Guido... -[Message 5] User: And its features? -[Message 6] AI: Python has many features... -... (many more messages) -[Message 20] User: Current question -``` +## 故障排除 (Troubleshooting) ❓ -### 压缩后 +- **初始系统提示丢失**:将 `keep_first` 设置为大于 0。 +- **压缩效果不明显**:提高 `compression_threshold_tokens`,或降低 `keep_first` / `keep_last` 以增强压缩力度。 +- **提交 Issue**: 如果遇到任何问题,请在 GitHub 上提交 Issue:[OpenWebUI Extensions Issues](https://github.com/Fu-Jie/openwebui-extensions/issues) -``` -[Summary] Previous conversation covered Python basics, -history, features, and common use cases... 
+## 更新日志 -[Message 18] User: Recent question about decorators -[Message 19] AI: Decorators in Python are... -[Message 20] User: Current question -``` - ---- - -## 运行要求 - -!!! note "前置条件" - - OpenWebUI v0.3.0 及以上 - - 需要可用的 LLM 用于摘要 - -!!! tip "最佳实践" - - 根据模型上下文窗口设置合适的 token 阈值 - - 技术讨论可适当提高 `preserve_recent` - - 先在非关键对话中测试压缩效果 - ---- - -## 常见问题 - -??? question "没有触发压缩?" - 检查 token 数是否超过配置的阈值,并开启调试日志了解细节。 - -??? question "重要上下文丢失?" - 提高 `preserve_recent` 或降低压缩比例。 - ---- - -## 源码 - -[:fontawesome-brands-github: 在 GitHub 查看](https://github.com/Fu-Jie/openwebui-extensions/tree/main/plugins/filters/async-context-compression){ .md-button } +完整历史请查看 GitHub 项目: [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) diff --git a/docs/plugins/filters/index.md b/docs/plugins/filters/index.md index cc0db66..2df8128 100644 --- a/docs/plugins/filters/index.md +++ b/docs/plugins/filters/index.md @@ -22,7 +22,7 @@ Filters act as middleware in the message pipeline: Reduces token consumption in long conversations through intelligent summarization while maintaining coherence. 
- **Version:** 1.2.2 + **Version:** 1.3.0 [:octicons-arrow-right-24: Documentation](async-context-compression.md) diff --git a/docs/plugins/filters/index.zh.md b/docs/plugins/filters/index.zh.md index 5e6eb56..60bfda7 100644 --- a/docs/plugins/filters/index.zh.md +++ b/docs/plugins/filters/index.zh.md @@ -22,7 +22,7 @@ Filter 充当消息管线中的中间件: 通过智能总结减少长对话的 token 消耗,同时保持连贯性。 - **版本:** 1.2.2 + **版本:** 1.3.0 [:octicons-arrow-right-24: 查看文档](async-context-compression.md) diff --git a/plugins/filters/async-context-compression/README.md b/plugins/filters/async-context-compression/README.md index 59aa86c..fe94989 100644 --- a/plugins/filters/async-context-compression/README.md +++ b/plugins/filters/async-context-compression/README.md @@ -1,18 +1,22 @@ # Async Context Compression Filter -**Author:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **Version:** 1.2.2 | **Project:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **License:** MIT +**Author:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **Version:** 1.3.0 | **Project:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **License:** MIT This filter reduces token consumption in long conversations through intelligent summarization and message compression while keeping conversations coherent. -## What's new in 1.2.2 +## What's new in 1.3.0 -- **Critical Fix**: Resolved `TypeError: 'str' object is not callable` caused by variable name conflict in logging function. -- **Compatibility**: Enhanced `params` handling to support Pydantic objects, improving compatibility with different OpenWebUI versions. +- **Internationalization (i18n)**: Complete localization of user-facing messages across 9 languages (English, Chinese, Japanese, Korean, French, German, Spanish, Italian). +- **Smart Status Display**: Added `token_usage_status_threshold` valve (default 80%) to intelligently control when token usage status is shown. 
+- **Improved Performance**: Frontend language detection and logging are optimized to be completely non-blocking, maintaining lightning-fast TTFB. +- **Copilot SDK Integration**: Automatically detects and skips compression for copilot_sdk based models to prevent conflicts. +- **Configuration**: `debug_mode` is now set to `false` by default for a quieter production experience. --- ## Core Features +- ✅ **Full i18n Support**: Native localization across 9 languages. - ✅ Automatic compression triggered by token thresholds. - ✅ Asynchronous summarization that does not block chat responses. - ✅ Persistent storage via Open WebUI's shared database connection (PostgreSQL, SQLite, etc.). @@ -55,8 +59,10 @@ This filter reduces token consumption in long conversations through intelligent | `summary_temperature` | `0.3` | Randomness for summary generation. Lower is more deterministic. | | `model_thresholds` | `{}` | Per-model overrides for `compression_threshold_tokens` and `max_context_tokens` (useful for mixed models). | | `enable_tool_output_trimming` | `false` | When enabled and `function_calling: "native"` is active, trims verbose tool outputs to extract only the final answer. | -| `debug_mode` | `true` | Log verbose debug info. Set to `false` in production. | +| `debug_mode` | `false` | Log verbose debug info. Set to `false` in production. | | `show_debug_log` | `false` | Print debug logs to browser console (F12). Useful for frontend debugging. | +| `show_token_usage_status` | `true` | Show token usage status notification in the chat interface. | +| `token_usage_status_threshold` | `80` | The minimum usage percentage (0-100) required to show a context usage status notification. 
| --- diff --git a/plugins/filters/async-context-compression/README_CN.md b/plugins/filters/async-context-compression/README_CN.md index e2860f9..797f19c 100644 --- a/plugins/filters/async-context-compression/README_CN.md +++ b/plugins/filters/async-context-compression/README_CN.md @@ -1,20 +1,24 @@ # 异步上下文压缩过滤器 -**作者:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **版本:** 1.2.2 | **项目:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **许可证:** MIT +**作者:** [Fu-Jie](https://github.com/Fu-Jie/openwebui-extensions) | **版本:** 1.3.0 | **项目:** [OpenWebUI Extensions](https://github.com/Fu-Jie/openwebui-extensions) | **许可证:** MIT > **重要提示**:为了确保所有过滤器的可维护性和易用性,每个过滤器都应附带清晰、完整的文档,以确保其功能、配置和使用方法得到充分说明。 本过滤器通过智能摘要和消息压缩技术,在保持对话连贯性的同时,显著降低长对话的 Token 消耗。 -## 1.2.2 版本更新 +## 1.3.0 版本更新 -- **严重错误修复**: 解决了因日志函数变量名冲突导致的 `TypeError: 'str' object is not callable` 错误。 -- **兼容性增强**: 改进了 `params` 处理逻辑以支持 Pydantic 对象,提高了对不同 OpenWebUI 版本的兼容性。 +- **国际化 (i18n) 支持**: 完成了所有用户可见消息的本地化,现已原生支持 9 种语言(含中、英、日、韩及欧洲主要语言)。 +- **智能状态显示**: 新增 `token_usage_status_threshold` 阀门(默认 80%),可以智能控制何时显示 Token 用量状态,减少不必要的打扰。 +- **性能大幅优化**: 对前端语言检测和日志处理流程进行了非阻塞重构,完全不影响首字节响应时间(TTFB),保持毫秒级极速推流。 +- **Copilot SDK 兼容**: 自动检测并跳过基于 `copilot_sdk` 模型的上下文压缩,避免冲突。 +- **配置项调整**: 为了提供更安静的生产环境体验,`debug_mode` 现已默认设置为 `false`。 --- ## 核心特性 +- ✅ **全方位国际化**: 原生支持 9 种界面语言。 - ✅ **自动压缩**: 基于 Token 阈值自动触发上下文压缩。 - ✅ **异步摘要**: 后台生成摘要,不阻塞当前对话响应。 - ✅ **持久化存储**: 复用 Open WebUI 共享数据库连接,自动支持 PostgreSQL/SQLite 等。 @@ -93,9 +97,10 @@ | 参数 | 默认值 | 描述 | | :----------------------------- | :------- | :-------------------------------------------------------------------------------------------------------------------------------------- | | `enable_tool_output_trimming` | `false` | 启用时,若 `function_calling: "native"` 激活,将裁剪冗长的工具输出以仅提取最终答案。 | -| `debug_mode` | `true` | 是否在 Open WebUI 的控制台日志中打印详细的调试信息(如 Token 计数、压缩进度、数据库操作等)。生产环境建议设为 `false`。 | +| `debug_mode` | `false` | 是否在 Open WebUI 的控制台日志中打印详细的调试信息。生产环境默认且建议设为 
`false`。 | | `show_debug_log` | `false` | 是否在浏览器控制台 (F12) 打印调试日志。便于前端调试。 | | `show_token_usage_status` | `true` | 是否在对话结束时显示 Token 使用情况的状态通知。 | +| `token_usage_status_threshold` | `80` | 触发显示上下文用量状态通知的最低百分比阈值 (0-100)。 | --- diff --git a/plugins/filters/async-context-compression/async_context_compression.py b/plugins/filters/async-context-compression/async_context_compression.py index 5070726..e59724c 100644 --- a/plugins/filters/async-context-compression/async_context_compression.py +++ b/plugins/filters/async-context-compression/async_context_compression.py @@ -5,17 +5,17 @@ author: Fu-Jie author_url: https://github.com/Fu-Jie/openwebui-extensions funding_url: https://github.com/open-webui description: Reduces token consumption in long conversations while maintaining coherence through intelligent summarization and message compression. -version: 1.2.2 +version: 1.3.0 openwebui_id: b1655bc8-6de9-4cad-8cb5-a6f7829a02ce license: MIT ═══════════════════════════════════════════════════════════════════════════════ -📌 What's new in 1.2.1 +📌 What's new in 1.3.0 ═══════════════════════════════════════════════════════════════════════════════ - ✅ Smart Configuration: Automatically detects base model settings for custom models and adds `summary_model_max_context` for independent summary limits. - ✅ Performance & Refactoring: Optimized threshold parsing with caching and removed redundant code for better efficiency. - ✅ Bug Fixes & Modernization: Fixed `datetime` deprecation warnings and corrected type annotations. + ✅ Smart Status Display: Added `token_usage_status_threshold` valve (default 80%) to control when token usage status is shown, reducing unnecessary notifications. + ✅ Copilot SDK Integration: Automatically detects and skips compression for copilot_sdk based models to prevent conflicts. + ✅ Improved User Experience: Status messages now only appear when token usage exceeds the configured threshold, keeping the interface cleaner. 
═══════════════════════════════════════════════════════════════════════════════ 📌 Overview @@ -150,7 +150,7 @@ summary_temperature Description: Controls the randomness of the summary generation. Lower values produce more deterministic output. debug_mode - Default: true + Default: false Description: Prints detailed debug information to the log. Recommended to set to `false` in production. show_debug_log @@ -268,6 +268,7 @@ import hashlib import time import contextlib import logging +from functools import lru_cache # Setup logger logger = logging.getLogger(__name__) @@ -391,6 +392,130 @@ class ChatSummary(owui_Base): ) +TRANSLATIONS = { + "en-US": { + "status_context_usage": "Context Usage (Estimated): {tokens} / {max_tokens} Tokens ({ratio}%)", + "status_high_usage": " | ⚠️ High Usage", + "status_loaded_summary": "Loaded historical summary (Hidden {count} historical messages)", + "status_context_summary_updated": "Context Summary Updated: {tokens} / {max_tokens} Tokens ({ratio}%)", + "status_generating_summary": "Generating context summary in background...", + "status_summary_error": "Summary Error: {error}", + "summary_prompt_prefix": "【Previous Summary: The following is a summary of the historical conversation, provided for context only. Do not reply to the summary content itself; answer the subsequent latest questions directly.】\n\n", + "summary_prompt_suffix": "\n\n---\nBelow is the recent conversation:", + "tool_trimmed": "... [Tool outputs trimmed]\n{content}", + "content_collapsed": "\n... 
[Content collapsed] ...\n",
+    },
+    "zh-CN": {
+        "status_context_usage": "上下文用量 (预估): {tokens} / {max_tokens} Tokens ({ratio}%)",
+        "status_high_usage": " | ⚠️ 用量较高",
+        "status_loaded_summary": "已加载历史总结 (隐藏了 {count} 条历史消息)",
+        "status_context_summary_updated": "上下文总结已更新: {tokens} / {max_tokens} Tokens ({ratio}%)",
+        "status_generating_summary": "正在后台生成上下文总结...",
+        "status_summary_error": "总结生成错误: {error}",
+        "summary_prompt_prefix": "【前情提要:以下是历史对话的总结,仅供上下文参考。请不要回复总结内容本身,直接回答之后最新的问题。】\n\n",
+        "summary_prompt_suffix": "\n\n---\n以下是最近的对话:",
+        "tool_trimmed": "... [工具输出已裁剪]\n{content}",
+        "content_collapsed": "\n... [内容已折叠] ...\n",
+    },
+    "zh-HK": {
+        "status_context_usage": "上下文用量 (預估): {tokens} / {max_tokens} Tokens ({ratio}%)",
+        "status_high_usage": " | ⚠️ 用量較高",
+        "status_loaded_summary": "已載入歷史總結 (隱藏了 {count} 條歷史訊息)",
+        "status_context_summary_updated": "上下文總結已更新: {tokens} / {max_tokens} Tokens ({ratio}%)",
+        "status_generating_summary": "正在後台生成上下文總結...",
+        "status_summary_error": "總結生成錯誤: {error}",
+        "summary_prompt_prefix": "【前情提要:以下是歷史對話的總結,僅供上下文參考。請不要回覆總結內容本身,直接回答之後最新的問題。】\n\n",
+        "summary_prompt_suffix": "\n\n---\n以下是最近的對話:",
+        "tool_trimmed": "... [工具輸出已裁剪]\n{content}",
+        "content_collapsed": "\n... [內容已折疊] ...\n",
+    },
+    "zh-TW": {
+        "status_context_usage": "上下文用量 (預估): {tokens} / {max_tokens} Tokens ({ratio}%)",
+        "status_high_usage": " | ⚠️ 用量較高",
+        "status_loaded_summary": "已載入歷史總結 (隱藏了 {count} 條歷史訊息)",
+        "status_context_summary_updated": "上下文總結已更新: {tokens} / {max_tokens} Tokens ({ratio}%)",
+        "status_generating_summary": "正在後台生成上下文總結...",
+        "status_summary_error": "總結生成錯誤: {error}",
+        "summary_prompt_prefix": "【前情提要:以下是歷史對話的總結,僅供上下文參考。請不要回覆總結內容本身,直接回答之後最新的問題。】\n\n",
+        "summary_prompt_suffix": "\n\n---\n以下是最近的對話:",
+        "tool_trimmed": "... [工具輸出已裁剪]\n{content}",
+        "content_collapsed": "\n... 
[內容已折疊] ...\n",
+    },
+    "ja-JP": {
+        "status_context_usage": "コンテキスト使用量 (推定): {tokens} / {max_tokens} トークン ({ratio}%)",
+        "status_high_usage": " | ⚠️ 使用量高",
+        "status_loaded_summary": "履歴の要約を読み込みました ({count} 件の履歴メッセージを非表示)",
+        "status_context_summary_updated": "コンテキストの要約が更新されました: {tokens} / {max_tokens} トークン ({ratio}%)",
+        "status_generating_summary": "バックグラウンドでコンテキスト要約を生成しています...",
+        "status_summary_error": "要約エラー: {error}",
+        "summary_prompt_prefix": "【これまでのあらすじ:以下は過去の会話の要約であり、コンテキストの参考としてのみ提供されます。要約の内容自体には返答せず、その後の最新の質問に直接答えてください。】\n\n",
+        "summary_prompt_suffix": "\n\n---\n以下は最近の会話です:",
+        "tool_trimmed": "... [ツールの出力をトリミングしました]\n{content}",
+        "content_collapsed": "\n... [コンテンツが折りたたまれました] ...\n",
+    },
+    "ko-KR": {
+        "status_context_usage": "컨텍스트 사용량 (예상): {tokens} / {max_tokens} 토큰 ({ratio}%)",
+        "status_high_usage": " | ⚠️ 사용량 높음",
+        "status_loaded_summary": "이전 요약 불러옴 ({count}개의 이전 메시지 숨김)",
+        "status_context_summary_updated": "컨텍스트 요약 업데이트됨: {tokens} / {max_tokens} 토큰 ({ratio}%)",
+        "status_generating_summary": "백그라운드에서 컨텍스트 요약 생성 중...",
+        "status_summary_error": "요약 오류: {error}",
+        "summary_prompt_prefix": "【이전 요약: 다음은 이전 대화의 요약이며 문맥 참고용으로만 제공됩니다. 요약 내용 자체에 답하지 말고 이후의 최신 질문에 직접 답하세요.】\n\n",
+        "summary_prompt_suffix": "\n\n---\n다음은 최근 대화입니다:",
+        "tool_trimmed": "... [도구 출력 잘림]\n{content}",
+        "content_collapsed": "\n... 
[내용 접힘] ...\n", + }, + "fr-FR": { + "status_context_usage": "Utilisation du contexte (estimée) : {tokens} / {max_tokens} jetons ({ratio}%)", + "status_high_usage": " | ⚠️ Utilisation élevée", + "status_loaded_summary": "Résumé historique chargé ({count} messages d'historique masqués)", + "status_context_summary_updated": "Résumé du contexte mis à jour : {tokens} / {max_tokens} jetons ({ratio}%)", + "status_generating_summary": "Génération du résumé du contexte en arrière-plan...", + "status_summary_error": "Erreur de résumé : {error}", + "summary_prompt_prefix": "【Résumé précédent : Ce qui suit est un résumé de la conversation historique, fourni uniquement pour le contexte. Ne répondez pas au contenu du résumé lui-même ; répondez directement aux dernières questions.】\n\n", + "summary_prompt_suffix": "\n\n---\nVoici la conversation récente :", + "tool_trimmed": "... [Sorties d'outils coupées]\n{content}", + "content_collapsed": "\n... [Contenu réduit] ...\n", + }, + "de-DE": { + "status_context_usage": "Kontextnutzung (geschätzt): {tokens} / {max_tokens} Tokens ({ratio}%)", + "status_high_usage": " | ⚠️ Hohe Nutzung", + "status_loaded_summary": "Historische Zusammenfassung geladen ({count} historische Nachrichten ausgeblendet)", + "status_context_summary_updated": "Kontextzusammenfassung aktualisiert: {tokens} / {max_tokens} Tokens ({ratio}%)", + "status_generating_summary": "Kontextzusammenfassung wird im Hintergrund generiert...", + "status_summary_error": "Zusammenfassungsfehler: {error}", + "summary_prompt_prefix": "【Vorherige Zusammenfassung: Das Folgende ist eine Zusammenfassung der historischen Konversation, die nur als Kontext dient. Antworten Sie nicht auf den Inhalt der Zusammenfassung selbst, sondern direkt auf die nachfolgenden neuesten Fragen.】\n\n", + "summary_prompt_suffix": "\n\n---\nHier ist die jüngste Konversation:", + "tool_trimmed": "... [Werkzeugausgaben gekürzt]\n{content}", + "content_collapsed": "\n... 
[Inhalt ausgeblendet] ...\n", + }, + "es-ES": { + "status_context_usage": "Uso del contexto (estimado): {tokens} / {max_tokens} Tokens ({ratio}%)", + "status_high_usage": " | ⚠️ Uso elevado", + "status_loaded_summary": "Resumen histórico cargado ({count} mensajes históricos ocultos)", + "status_context_summary_updated": "Resumen del contexto actualizado: {tokens} / {max_tokens} Tokens ({ratio}%)", + "status_generating_summary": "Generando resumen del contexto en segundo plano...", + "status_summary_error": "Error de resumen: {error}", + "summary_prompt_prefix": "【Resumen anterior: El siguiente es un resumen de la conversación histórica, proporcionado solo como contexto. No responda al contenido del resumen en sí; responda directamente a las preguntas más recientes.】\n\n", + "summary_prompt_suffix": "\n\n---\nA continuación se muestra la conversación reciente:", + "tool_trimmed": "... [Salidas de herramientas recortadas]\n{content}", + "content_collapsed": "\n... [Contenido contraído] ...\n", + }, + "it-IT": { + "status_context_usage": "Utilizzo contesto (stimato): {tokens} / {max_tokens} Token ({ratio}%)", + "status_high_usage": " | ⚠️ Utilizzo elevato", + "status_loaded_summary": "Riepilogo storico caricato ({count} messaggi storici nascosti)", + "status_context_summary_updated": "Riepilogo contesto aggiornato: {tokens} / {max_tokens} Token ({ratio}%)", + "status_generating_summary": "Generazione riepilogo contesto in background...", + "status_summary_error": "Errore riepilogo: {error}", + "summary_prompt_prefix": "【Riepilogo precedente: Il seguente è un riepilogo della conversazione storica, fornito solo per contesto. Non rispondere al contenuto del riepilogo stesso; rispondi direttamente alle domande più recenti.】\n\n", + "summary_prompt_suffix": "\n\n---\nDi seguito è riportata la conversazione recente:", + "tool_trimmed": "... [Output degli strumenti tagliati]\n{content}", + "content_collapsed": "\n... 
[Contenuto compresso] ...\n", + }, +} + + # Global cache for tiktoken encoding TIKTOKEN_ENCODING = None if tiktoken: @@ -400,6 +525,26 @@ if tiktoken: logger.error(f"[Init] Failed to load tiktoken encoding: {e}") +@lru_cache(maxsize=1024) +def _get_cached_tokens(text: str) -> int: + """Calculates tokens with LRU caching for exact string matches.""" + if not text: + return 0 + if TIKTOKEN_ENCODING: + try: + # tiktoken logic is relatively fast, but caching it based on exact string match + # turns O(N) encoding time to O(1) dictionary lookup for historical messages. + return len(TIKTOKEN_ENCODING.encode(text)) + except Exception as e: + logger.warning( + f"[Token Count] tiktoken error: {e}, falling back to character estimation" + ) + pass + + # Fallback strategy: Rough estimation (1 token ≈ 4 chars) + return len(text) // 4 + + class Filter: def __init__(self): self.valves = self.Valves() @@ -409,8 +554,105 @@ class Filter: sessionmaker(bind=self._db_engine) if self._db_engine else None ) self._model_thresholds_cache: Optional[Dict[str, Any]] = None + + # Fallback mapping for variants not in TRANSLATIONS keys + self.fallback_map = { + "es-AR": "es-ES", + "es-MX": "es-ES", + "fr-CA": "fr-FR", + "en-CA": "en-US", + "en-GB": "en-US", + "en-AU": "en-US", + "de-AT": "de-DE", + } + self._init_database() + def _resolve_language(self, lang: str) -> str: + """Resolve the best matching language code from the TRANSLATIONS dict.""" + target_lang = lang + + # 1. Direct match + if target_lang in TRANSLATIONS: + return target_lang + + # 2. Variant fallback (explicit mapping) + if target_lang in self.fallback_map: + target_lang = self.fallback_map[target_lang] + if target_lang in TRANSLATIONS: + return target_lang + + # 3. Base language fallback (e.g. fr-BE -> fr-FR) + if "-" in lang: + base_lang = lang.split("-")[0] + for supported_lang in TRANSLATIONS: + if supported_lang.startswith(base_lang + "-"): + return supported_lang + + # 4. 
Final Fallback to en-US + return "en-US" + + def _get_translation(self, lang: str, key: str, **kwargs) -> str: + """Get translated string for the given language and key.""" + target_lang = self._resolve_language(lang) + lang_dict = TRANSLATIONS.get(target_lang, TRANSLATIONS["en-US"]) + text = lang_dict.get(key, TRANSLATIONS["en-US"].get(key, key)) + if kwargs: + try: + text = text.format(**kwargs) + except Exception as e: + logger.warning(f"Translation formatting failed for {key}: {e}") + return text + + async def _get_user_context( + self, + __user__: Optional[Dict[str, Any]], + __event_call__: Optional[Callable[[Any], Awaitable[None]]] = None, + ) -> Dict[str, str]: + """Extract basic user context with safe fallbacks.""" + if isinstance(__user__, (list, tuple)): + user_data = __user__[0] if __user__ else {} + elif isinstance(__user__, dict): + user_data = __user__ + else: + user_data = {} + + user_id = user_data.get("id", "unknown_user") + user_name = user_data.get("name", "User") + user_language = user_data.get("language", "en-US") + + if __event_call__: + try: + js_code = """ + return ( + document.documentElement.lang || + localStorage.getItem('locale') || + localStorage.getItem('language') || + navigator.language || + 'en-US' + ); + """ + frontend_lang = await asyncio.wait_for( + __event_call__({"type": "execute", "data": {"code": js_code}}), + timeout=1.0, + ) + if frontend_lang and isinstance(frontend_lang, str): + user_language = frontend_lang + except asyncio.TimeoutError: + logger.warning( + "Failed to retrieve frontend language: Timeout (using fallback)" + ) + except Exception as e: + logger.warning( + f"Failed to retrieve frontend language: {type(e).__name__}: {e}" + ) + + return { + "user_id": user_id, + "user_name": user_name, + "user_language": user_language, + } + def _parse_model_thresholds(self) -> Dict[str, Any]: """Parse model_thresholds string into a dictionary. 
@@ -574,7 +816,7 @@ class Filter: description="The temperature for summary generation.", ) debug_mode: bool = Field( - default=True, description="Enable detailed logging for debugging." + default=False, description="Enable detailed logging for debugging." ) show_debug_log: bool = Field( default=False, description="Show debug logs in the frontend console" @@ -582,6 +824,12 @@ class Filter: show_token_usage_status: bool = Field( default=True, description="Show token usage status notification" ) + token_usage_status_threshold: int = Field( + default=80, + ge=0, + le=100, + description="Only show token usage status when usage exceeds this percentage (0-100). Set to 0 to always show.", + ) enable_tool_output_trimming: bool = Field( default=False, description="Enable trimming of large tool outputs (only works with native function calling).", @@ -654,20 +902,7 @@ class Filter: def _count_tokens(self, text: str) -> int: """Counts the number of tokens in the text.""" - if not text: - return 0 - - if TIKTOKEN_ENCODING: - try: - return len(TIKTOKEN_ENCODING.encode(text)) - except Exception as e: - if self.valves.debug_mode: - logger.warning( - f"[Token Count] tiktoken error: {e}, falling back to character estimation" - ) - - # Fallback strategy: Rough estimation (1 token ≈ 4 chars) - return len(text) // 4 + return _get_cached_tokens(text) def _calculate_messages_tokens(self, messages: List[Dict]) -> int: """Calculates the total tokens for a list of messages.""" @@ -693,6 +928,20 @@ class Filter: return total_tokens + def _estimate_messages_tokens(self, messages: List[Dict]) -> int: + """Fast estimation of tokens based on character count (1/4 ratio).""" + total_chars = 0 + for msg in messages: + content = msg.get("content", "") + if isinstance(content, list): + for part in content: + if isinstance(part, dict) and part.get("type") == "text": + total_chars += len(part.get("text", "")) + else: + total_chars += len(str(content)) + + return total_chars // 4 + def 
_get_model_thresholds(self, model_id: str) -> Dict[str, int]: """Gets threshold configuration for a specific model. @@ -830,11 +1079,13 @@ class Filter: }})(); """ - await __event_call__( - { - "type": "execute", - "data": {"code": js_code}, - } + asyncio.create_task( + __event_call__( + { + "type": "execute", + "data": {"code": js_code}, + } + ) ) except Exception as e: logger.error(f"Error emitting debug log: {e}") @@ -876,17 +1127,55 @@ class Filter: js_code = f""" console.log("%c[Compression] {safe_message}", "{css}"); """ - # Add timeout to prevent blocking if frontend connection is broken - await asyncio.wait_for( - event_call({"type": "execute", "data": {"code": js_code}}), - timeout=2.0, - ) - except asyncio.TimeoutError: - logger.warning( - f"Failed to emit log to frontend: Timeout (connection may be broken)" + asyncio.create_task( + event_call({"type": "execute", "data": {"code": js_code}}) ) except Exception as e: - logger.error(f"Failed to emit log to frontend: {type(e).__name__}: {e}") + logger.error( + f"Failed to process log to frontend: {type(e).__name__}: {e}" + ) + + def _should_show_status(self, usage_ratio: float) -> bool: + """ + Check if token usage status should be shown based on threshold. + + Args: + usage_ratio: Current usage ratio (0.0 to 1.0) + + Returns: + True if status should be shown, False otherwise + """ + if not self.valves.show_token_usage_status: + return False + + # If threshold is 0, always show + if self.valves.token_usage_status_threshold == 0: + return True + + # Check if usage exceeds threshold + threshold_ratio = self.valves.token_usage_status_threshold / 100.0 + return usage_ratio >= threshold_ratio + + def _should_skip_compression( + self, body: dict, __model__: Optional[dict] = None + ) -> bool: + """ + Check if compression should be skipped. + Returns True if: + 1. 
The base model includes 'copilot_sdk' + """ + # Check if base model includes copilot_sdk + if __model__: + base_model_id = __model__.get("base_model_id", "") + if "copilot_sdk" in base_model_id.lower(): + return True + + # Also check model in body + model_id = body.get("model", "") + if "copilot_sdk" in model_id.lower(): + return True + + return False async def inlet( self, @@ -903,6 +1192,19 @@ class Filter: Compression Strategy: Only responsible for injecting existing summaries, no Token calculation. """ + # Check if compression should be skipped (e.g., for copilot_sdk) + if self._should_skip_compression(body, __model__): + if self.valves.debug_mode: + logger.info( + "[Inlet] Skipping compression: copilot_sdk detected in base model" + ) + if self.valves.show_debug_log and __event_call__: + await self._log( + "[Inlet] ⏭️ Skipping compression: copilot_sdk detected", + event_call=__event_call__, + ) + return body + messages = body.get("messages", []) # --- Native Tool Output Trimming (Opt-in, only for native function calling) --- @@ -966,8 +1268,14 @@ class Filter: final_answer = content[last_match_end:].strip() if final_answer: - msg["content"] = ( - f"... [Tool outputs trimmed]\n{final_answer}" + msg["content"] = self._get_translation( + ( + __user__.get("language", "en-US") + if __user__ + else "en-US" + ), + "tool_trimmed", + content=final_answer, ) trimmed_count += 1 else: @@ -980,8 +1288,14 @@ class Filter: if len(parts) > 1: final_answer = parts[-1].strip() if final_answer: - msg["content"] = ( - f"... 
[Tool outputs trimmed]\n{final_answer}" + msg["content"] = self._get_translation( + ( + __user__.get("language", "en-US") + if __user__ + else "en-US" + ), + "tool_trimmed", + content=final_answer, ) trimmed_count += 1 @@ -1173,6 +1487,10 @@ class Filter: # Target is to compress up to the (total - keep_last) message target_compressed_count = max(0, len(messages) - self.valves.keep_last) + # Get user context for i18n + user_ctx = await self._get_user_context(__user__, __event_call__) + lang = user_ctx["user_language"] + await self._log( f"[Inlet] Recorded target compression progress: {target_compressed_count}", event_call=__event_call__, @@ -1207,10 +1525,9 @@ class Filter: # 2. Summary message (Inserted as Assistant message) summary_content = ( - f"【Previous Summary: The following is a summary of the historical conversation, provided for context only. Do not reply to the summary content itself; answer the subsequent latest questions directly.】\n\n" - f"{summary_record.summary}\n\n" - f"---\n" - f"Below is the recent conversation:" + self._get_translation(lang, "summary_prompt_prefix") + + f"{summary_record.summary}" + + self._get_translation(lang, "summary_prompt_suffix") ) summary_msg = {"role": "assistant", "content": summary_content} @@ -1249,16 +1566,27 @@ class Filter: "max_context_tokens", self.valves.max_context_tokens ) - # Calculate total tokens - total_tokens = await asyncio.to_thread( - self._calculate_messages_tokens, calc_messages - ) + # --- Fast Estimation Check --- + estimated_tokens = self._estimate_messages_tokens(calc_messages) - # Preflight Check Log - await self._log( - f"[Inlet] 🔎 Preflight Check: {total_tokens}t / {max_context_tokens}t ({(total_tokens/max_context_tokens*100):.1f}%)", - event_call=__event_call__, - ) + # Since this is a hard limit check, only skip precise calculation if we are far below it (margin of 15%) + if estimated_tokens < max_context_tokens * 0.85: + total_tokens = estimated_tokens + await self._log( + f"[Inlet] 🔎 Fast 
Preflight Check (Est): {total_tokens}t / {max_context_tokens}t (Well within limit)", + event_call=__event_call__, + ) + else: + # Calculate exact total tokens via tiktoken + total_tokens = await asyncio.to_thread( + self._calculate_messages_tokens, calc_messages + ) + + # Preflight Check Log + await self._log( + f"[Inlet] 🔎 Precise Preflight Check: {total_tokens}t / {max_context_tokens}t ({(total_tokens/max_context_tokens*100):.1f}%)", + event_call=__event_call__, + ) # If over budget, reduce history (Keep Last) if total_tokens > max_context_tokens: @@ -1325,7 +1653,9 @@ class Filter: first_line_found = True # Add placeholder if there's more content coming if idx < last_line_idx: - kept_lines.append("\n... [Content collapsed] ...\n") + kept_lines.append( + self._get_translation(lang, "content_collapsed") + ) continue # Keep last non-empty line @@ -1347,8 +1677,13 @@ class Filter: target_msg["metadata"]["is_trimmed"] = True # Calculate token reduction - old_tokens = self._count_tokens(content) - new_tokens = self._count_tokens(target_msg["content"]) + # Use current token strategy + if total_tokens == estimated_tokens: + old_tokens = len(content) // 4 + new_tokens = len(target_msg["content"]) // 4 + else: + old_tokens = self._count_tokens(content) + new_tokens = self._count_tokens(target_msg["content"]) diff = old_tokens - new_tokens total_tokens -= diff @@ -1362,7 +1697,12 @@ class Filter: # Strategy 2: Fallback - Drop Oldest Message Entirely (FIFO) # (User requested to remove progressive trimming for other cases) dropped = tail_messages.pop(0) - dropped_tokens = self._count_tokens(str(dropped.get("content", ""))) + if total_tokens == estimated_tokens: + dropped_tokens = len(str(dropped.get("content", ""))) // 4 + else: + dropped_tokens = self._count_tokens( + str(dropped.get("content", "")) + ) total_tokens -= dropped_tokens if self.valves.show_debug_log and __event_call__: @@ -1382,14 +1722,24 @@ class Filter: final_messages = candidate_messages # Calculate 
detailed token stats for logging - system_tokens = ( - self._count_tokens(system_prompt_msg.get("content", "")) - if system_prompt_msg - else 0 - ) - head_tokens = self._calculate_messages_tokens(head_messages) - summary_tokens = self._count_tokens(summary_content) - tail_tokens = self._calculate_messages_tokens(tail_messages) + if total_tokens == estimated_tokens: + system_tokens = ( + len(system_prompt_msg.get("content", "")) // 4 + if system_prompt_msg + else 0 + ) + head_tokens = self._estimate_messages_tokens(head_messages) + summary_tokens = len(summary_content) // 4 + tail_tokens = self._estimate_messages_tokens(tail_messages) + else: + system_tokens = ( + self._count_tokens(system_prompt_msg.get("content", "")) + if system_prompt_msg + else 0 + ) + head_tokens = self._calculate_messages_tokens(head_messages) + summary_tokens = self._count_tokens(summary_content) + tail_tokens = self._calculate_messages_tokens(tail_messages) system_info = ( f"System({system_tokens}t)" if system_prompt_msg else "System(0t)" @@ -1408,22 +1758,43 @@ class Filter: # Prepare status message (Context Usage format) if max_context_tokens > 0: usage_ratio = total_section_tokens / max_context_tokens - status_msg = f"Context Usage (Estimated): {total_section_tokens} / {max_context_tokens} Tokens ({usage_ratio*100:.1f}%)" - if usage_ratio > 0.9: - status_msg += " | ⚠️ High Usage" - else: - status_msg = f"Loaded historical summary (Hidden {compressed_count} historical messages)" + # Only show status if threshold is met + if self._should_show_status(usage_ratio): + status_msg = self._get_translation( + lang, + "status_context_usage", + tokens=total_section_tokens, + max_tokens=max_context_tokens, + ratio=f"{usage_ratio*100:.1f}", + ) + if usage_ratio > 0.9: + status_msg += self._get_translation(lang, "status_high_usage") - if __event_emitter__: - await __event_emitter__( - { - "type": "status", - "data": { - "description": status_msg, - "done": True, - }, - } - ) + if __event_emitter__: + 
await __event_emitter__( + { + "type": "status", + "data": { + "description": status_msg, + "done": True, + }, + } + ) + else: + # For the case where max_context_tokens is 0, show summary info without threshold check + if self.valves.show_token_usage_status and __event_emitter__: + status_msg = self._get_translation( + lang, "status_loaded_summary", count=compressed_count + ) + await __event_emitter__( + { + "type": "status", + "data": { + "description": status_msg, + "done": True, + }, + } + ) # Emit debug log to frontend (Keep the structured log as well) await self._emit_debug_log( @@ -1454,9 +1825,20 @@ class Filter: "max_context_tokens", self.valves.max_context_tokens ) - total_tokens = await asyncio.to_thread( - self._calculate_messages_tokens, calc_messages - ) + # --- Fast Estimation Check --- + estimated_tokens = self._estimate_messages_tokens(calc_messages) + + # Only skip precise calculation if we are clearly below the limit + if estimated_tokens < max_context_tokens * 0.85: + total_tokens = estimated_tokens + await self._log( + f"[Inlet] 🔎 Fast limit check (Est): {total_tokens}t / {max_context_tokens}t", + event_call=__event_call__, + ) + else: + total_tokens = await asyncio.to_thread( + self._calculate_messages_tokens, calc_messages + ) if total_tokens > max_context_tokens: await self._log( @@ -1476,7 +1858,12 @@ class Filter: > start_trim_index + 1 # Keep at least 1 message after keep_first ): dropped = final_messages.pop(start_trim_index) - dropped_tokens = self._count_tokens(str(dropped.get("content", ""))) + if total_tokens == estimated_tokens: + dropped_tokens = len(str(dropped.get("content", ""))) // 4 + else: + dropped_tokens = self._count_tokens( + str(dropped.get("content", "")) + ) total_tokens -= dropped_tokens await self._log( @@ -1485,23 +1872,30 @@ class Filter: ) # Send status notification (Context Usage format) - if __event_emitter__: - status_msg = f"Context Usage (Estimated): {total_tokens} / {max_context_tokens} Tokens" - if 
max_context_tokens > 0: - usage_ratio = total_tokens / max_context_tokens - status_msg += f" ({usage_ratio*100:.1f}%)" + if max_context_tokens > 0: + usage_ratio = total_tokens / max_context_tokens + # Only show status if threshold is met + if self._should_show_status(usage_ratio): + status_msg = self._get_translation( + lang, + "status_context_usage", + tokens=total_tokens, + max_tokens=max_context_tokens, + ratio=f"{usage_ratio*100:.1f}", + ) if usage_ratio > 0.9: - status_msg += " | ⚠️ High Usage" + status_msg += self._get_translation(lang, "status_high_usage") - await __event_emitter__( - { - "type": "status", - "data": { - "description": status_msg, - "done": True, - }, - } - ) + if __event_emitter__: + await __event_emitter__( + { + "type": "status", + "data": { + "description": status_msg, + "done": True, + }, + } + ) body["messages"] = final_messages @@ -1517,6 +1911,7 @@ class Filter: body: dict, __user__: Optional[dict] = None, __metadata__: dict = None, + __model__: dict = None, __event_emitter__: Callable[[Any], Awaitable[None]] = None, __event_call__: Callable[[Any], Awaitable[None]] = None, ) -> dict: @@ -1524,6 +1919,23 @@ class Filter: Executed after the LLM response is complete. Calculates Token count in the background and triggers summary generation (does not block current response, does not affect content output). 
""" + # Check if compression should be skipped (e.g., for copilot_sdk) + if self._should_skip_compression(body, __model__): + if self.valves.debug_mode: + logger.info( + "[Outlet] Skipping compression: copilot_sdk detected in base model" + ) + if self.valves.show_debug_log and __event_call__: + await self._log( + "[Outlet] ⏭️ Skipping compression: copilot_sdk detected", + event_call=__event_call__, + ) + return body + + # Get user context for i18n + user_ctx = await self._get_user_context(__user__, __event_call__) + lang = user_ctx["user_language"] + chat_ctx = self._get_chat_context(body, __metadata__) chat_id = chat_ctx["chat_id"] if not chat_id: @@ -1547,6 +1959,7 @@ class Filter: body, __user__, target_compressed_count, + lang, __event_emitter__, __event_call__, ) @@ -1561,6 +1974,7 @@ class Filter: body: dict, user_data: Optional[dict], target_compressed_count: Optional[int], + lang: str = "en-US", __event_emitter__: Callable[[Any], Awaitable[None]] = None, __event_call__: Callable[[Any], Awaitable[None]] = None, ): @@ -1595,37 +2009,58 @@ class Filter: event_call=__event_call__, ) - # Calculate Token count in a background thread - current_tokens = await asyncio.to_thread( - self._calculate_messages_tokens, messages - ) + # --- Fast Estimation Check --- + estimated_tokens = self._estimate_messages_tokens(messages) - await self._log( - f"[🔍 Background Calculation] Token count: {current_tokens}", - event_call=__event_call__, - ) + # For triggering summary generation, we need to be more precise if we are in the grey zone + # Margin is 15% (skip tiktoken if estimated is < 85% of threshold) + # Note: We still use tiktoken if we exceed threshold, because we want an accurate usage status report + if estimated_tokens < compression_threshold_tokens * 0.85: + current_tokens = estimated_tokens + await self._log( + f"[🔍 Background Calculation] Fast estimate ({current_tokens}) is well below threshold ({compression_threshold_tokens}). 
Skipping tiktoken.", + event_call=__event_call__, + ) + else: + # Calculate Token count precisely in a background thread + current_tokens = await asyncio.to_thread( + self._calculate_messages_tokens, messages + ) + await self._log( + f"[🔍 Background Calculation] Precise token count: {current_tokens}", + event_call=__event_call__, + ) # Send status notification (Context Usage format) - if __event_emitter__ and self.valves.show_token_usage_status: + if __event_emitter__: max_context_tokens = thresholds.get( "max_context_tokens", self.valves.max_context_tokens ) - status_msg = f"Context Usage (Estimated): {current_tokens} / {max_context_tokens} Tokens" if max_context_tokens > 0: usage_ratio = current_tokens / max_context_tokens - status_msg += f" ({usage_ratio*100:.1f}%)" - if usage_ratio > 0.9: - status_msg += " | ⚠️ High Usage" + # Only show status if threshold is met + if self._should_show_status(usage_ratio): + status_msg = self._get_translation( + lang, + "status_context_usage", + tokens=current_tokens, + max_tokens=max_context_tokens, + ratio=f"{usage_ratio*100:.1f}", + ) + if usage_ratio > 0.9: + status_msg += self._get_translation( + lang, "status_high_usage" + ) - await __event_emitter__( - { - "type": "status", - "data": { - "description": status_msg, - "done": True, - }, - } - ) + await __event_emitter__( + { + "type": "status", + "data": { + "description": status_msg, + "done": True, + }, + } + ) # Check if compression is needed if current_tokens >= compression_threshold_tokens: @@ -1642,6 +2077,7 @@ class Filter: body, user_data, target_compressed_count, + lang, __event_emitter__, __event_call__, ) @@ -1672,6 +2108,7 @@ class Filter: body: dict, user_data: Optional[dict], target_compressed_count: Optional[int], + lang: str = "en-US", __event_emitter__: Callable[[Any], Awaitable[None]] = None, __event_call__: Callable[[Any], Awaitable[None]] = None, ): @@ -1811,7 +2248,9 @@ class Filter: { "type": "status", "data": { - "description": "Generating context 
summary in background...", + "description": self._get_translation( + lang, "status_generating_summary" + ), "done": False, }, } @@ -1849,7 +2288,11 @@ class Filter: { "type": "status", "data": { - "description": f"Context summary updated (Compressed {len(middle_messages)} messages)", + "description": self._get_translation( + lang, + "status_loaded_summary", + count=len(middle_messages), + ), "done": True, }, } @@ -1910,10 +2353,9 @@ class Filter: # Summary summary_content = ( - f"【System Prompt: The following is a summary of the historical conversation, provided for context only. Do not reply to the summary content itself; answer the subsequent latest questions directly.】\n\n" - f"{new_summary}\n\n" - f"---\n" - f"Below is the recent conversation:" + self._get_translation(lang, "summary_prompt_prefix") + + f"{new_summary}" + + self._get_translation(lang, "summary_prompt_suffix") ) summary_msg = {"role": "assistant", "content": summary_content} @@ -1943,23 +2385,32 @@ class Filter: max_context_tokens = thresholds.get( "max_context_tokens", self.valves.max_context_tokens ) - # 6. Emit Status - status_msg = f"Context Summary Updated: {token_count} / {max_context_tokens} Tokens" + # 6. 
Emit Status (only if threshold is met) if max_context_tokens > 0: - ratio = (token_count / max_context_tokens) * 100 - status_msg += f" ({ratio:.1f}%)" - if ratio > 90.0: - status_msg += " | ⚠️ High Usage" + usage_ratio = token_count / max_context_tokens + # Only show status if threshold is met + if self._should_show_status(usage_ratio): + status_msg = self._get_translation( + lang, + "status_context_summary_updated", + tokens=token_count, + max_tokens=max_context_tokens, + ratio=f"{usage_ratio*100:.1f}", + ) + if usage_ratio > 0.9: + status_msg += self._get_translation( + lang, "status_high_usage" + ) - await __event_emitter__( - { - "type": "status", - "data": { - "description": status_msg, - "done": True, - }, - } - ) + await __event_emitter__( + { + "type": "status", + "data": { + "description": status_msg, + "done": True, + }, + } + ) except Exception as e: await self._log( f"[Status] Error calculating tokens: {e}", @@ -1979,7 +2430,9 @@ class Filter: { "type": "status", "data": { - "description": f"Summary Error: {str(e)[:100]}...", + "description": self._get_translation( + lang, "status_summary_error", error=str(e)[:100] + ), "done": True, }, } diff --git a/plugins/filters/async-context-compression/async_context_compression_cn.py b/plugins/filters/async-context-compression/async_context_compression_cn.py deleted file mode 100644 index c8cf148..0000000 --- a/plugins/filters/async-context-compression/async_context_compression_cn.py +++ /dev/null @@ -1,2028 +0,0 @@ -""" -title: 异步上下文压缩 -id: async_context_compression -author: Fu-Jie -author_url: https://github.com/Fu-Jie/openwebui-extensions -funding_url: https://github.com/open-webui -description: 通过智能摘要和消息压缩,降低长对话的 token 消耗,同时保持对话连贯性。 -version: 1.2.2 -openwebui_id: 5c0617cb-a9e4-4bd6-a440-d276534ebd18 -license: MIT - -═══════════════════════════════════════════════════════════════════════════════ -📌 1.2.1 版本更新 -═══════════════════════════════════════════════════════════════════════════════ - - ✅ 
智能配置增强:自动检测自定义模型的基础模型配置,并新增 `summary_model_max_context` 参数以独立控制摘要模型的上下文限制。 - ✅ 性能优化与重构:重构了阈值解析逻辑并增加缓存,移除了冗余的处理代码,并增强了 LLM 响应处理(支持 JSONResponse)。 - ✅ 稳定性改进:修复了 `datetime` 弃用警告,修正了类型注解,并将 print 语句替换为标准日志记录。 - -═══════════════════════════════════════════════════════════════════════════════ -📌 功能概述 -═══════════════════════════════════════════════════════════════════════════════ - -本过滤器通过智能摘要和消息压缩技术,显著降低长对话的 token 消耗,同时保持对话连贯性。 - -核心特性: - ✅ 自动触发压缩(基于 Token 数量阈值) - ✅ 异步生成摘要(不阻塞用户响应) - ✅ 数据库持久化存储(支持 PostgreSQL 和 SQLite) - ✅ 灵活的保留策略(可配置保留对话的头部和尾部) - ✅ 智能注入摘要,保持上下文连贯性 - -═══════════════════════════════════════════════════════════════════════════════ -🔄 工作流程 -═══════════════════════════════════════════════════════════════════════════════ - -阶段 1: inlet(请求前处理) -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - 1. 接收当前对话的所有消息。 - 2. 检查是否存在已保存的摘要。 - 3. 如果有摘要且消息数超过保留阈值: - ├─ 提取要保留的头部消息(例如,第一条消息)。 - ├─ 将摘要注入到头部消息中。 - ├─ 提取要保留的尾部消息。 - └─ 组合成新的消息列表:[头部消息+摘要] + [尾部消息]。 - 4. 发送压缩后的消息到 LLM。 - -阶段 2: outlet(响应后处理) -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - 1. LLM 响应完成后触发。 - 2. 检查 Token 数是否达到压缩阈值。 - 3. 
如果达到 Token 阈值,则在后台异步生成摘要: - ├─ 提取需要摘要的消息(排除保留的头部和尾部)。 - ├─ 调用 LLM 生成简洁摘要。 - └─ 将摘要保存到数据库。 - -═══════════════════════════════════════════════════════════════════════════════ -💾 存储方案 -═══════════════════════════════════════════════════════════════════════════════ - -本过滤器使用 Open WebUI 的共享数据库连接进行持久化存储。 -它自动复用 Open WebUI 内部的 SQLAlchemy 引擎和 SessionLocal, -使插件与数据库类型无关,并确保与 Open WebUI 支持的任何数据库后端 -(PostgreSQL、SQLite 等)兼容。 - -无需额外的数据库配置 - 插件自动继承 Open WebUI 的数据库设置。 - - 表结构: - - id: 主键(自增) - - chat_id: 对话唯一标识(唯一索引) - - summary: 摘要内容(TEXT) - - compressed_message_count: 原始消息数 - - created_at: 创建时间 - - updated_at: 更新时间 - -═══════════════════════════════════════════════════════════════════════════════ -📊 压缩效果示例 -═══════════════════════════════════════════════════════════════════════════════ - -场景:20 条消息的对话 (默认设置: 保留前 1 条, 后 6 条) -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - 压缩前: - 消息 1: [初始设定 + 初始问题] - 消息 2-14: [历史对话内容] - 消息 15-20: [最近对话] - 总计: 20 条完整消息 - - 压缩后: - 消息 1: [初始设定 + 历史摘要 + 初始问题] - 消息 15-20: [最近 6 条完整消息] - 总计: 7 条消息 - - 效果: - ✓ 节省 13 条消息(约 65%) - ✓ 保留完整上下文信息 - ✓ 保护重要的初始设定 - -═══════════════════════════════════════════════════════════════════════════════ -⚙️ 配置参数说明 -═══════════════════════════════════════════════════════════════════════════════ - -priority (优先级) - 默认: 10 - 说明: 过滤器执行顺序,数值越小越先执行。 - -compression_threshold_tokens (压缩阈值 Token) - 默认: 64000 - 说明: 当上下文总 Token 数超过此值时,触发压缩。 - 建议: 根据模型上下文窗口和成本调整。 - -max_context_tokens (最大上下文 Token) - 默认: 128000 - 说明: 上下文的硬性上限。超过此值将强制移除最早的消息。 - -model_thresholds (模型特定阈值) - 默认: {} - 说明: 针对特定模型的阈值覆盖配置。 - 示例: {"gpt-4": {"compression_threshold_tokens": 8000, "max_context_tokens": 32000}} - -keep_first (保留初始消息数) - 默认: 1 - 说明: 始终保留对话开始的 N 条消息。设置为 0 则不保留。第一条消息通常包含重要的提示或环境变量。 - -keep_last (保留最近消息数) - 默认: 6 - 说明: 始终保留对话末尾的 N 条完整消息,以确保上下文的连贯性。 - -summary_model (摘要模型) - 默认: None - 说明: 用于生成摘要的 LLM 模型。 - 建议: - - 强烈建议配置一个快速且经济的兼容模型,如 `deepseek-v3`、`gemini-2.5-flash`、`gpt-4.1`。 - - 如果留空,过滤器将尝试使用当前对话的模型。 - 注意: - - 
如果当前对话使用的是流水线(Pipe)模型或不直接支持标准生成API的模型,留空此项可能会导致摘要生成失败。在这种情况下,必须指定一个有效的模型。 - -max_summary_tokens (摘要长度) - 默认: 16384 - 说明: 生成摘要时允许的最大 token 数。 - -summary_temperature (摘要温度) - 默认: 0.3 - 说明: 控制摘要生成的随机性,较低的值会产生更确定性的输出。 - -debug_mode (调试模式) - 默认: true - 说明: 在日志中打印详细的调试信息。生产环境建议设为 `false`。 - -show_debug_log (前端调试日志) - 默认: false - 说明: 在浏览器控制台打印调试日志 (F12)。便于前端调试。 - -🔧 部署配置 -═══════════════════════════════════════════════════════ - -插件自动使用 Open WebUI 的共享数据库连接。 -无需额外的数据库配置。 - -过滤器安装顺序建议: -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -建议将此过滤器的优先级设置得相对较高(数值较小),以确保它在其他可能修改消息内容的过滤器之前运行。一个典型的顺序可能是: - - 1. 需要访问完整、未压缩历史记录的过滤器 (priority < 10) - (例如: 注入系统级提示的过滤器) - 2. 本压缩过滤器 (priority = 10) - 3. 在压缩后运行的过滤器 (priority > 10) - (例如: 最终输出格式化过滤器) - -═══════════════════════════════════════════════════════════════════════════════ -📝 数据库查询示例 -═══════════════════════════════════════════════════════════════════════════════ - -查看所有摘要: - SELECT - chat_id, - LEFT(summary, 100) as summary_preview, - compressed_message_count, - updated_at - FROM chat_summary - ORDER BY updated_at DESC; - -查询特定对话: - SELECT * - FROM chat_summary - WHERE chat_id = 'your_chat_id'; - -删除过期摘要: - DELETE FROM chat_summary - WHERE updated_at < NOW() - INTERVAL '30 days'; - -统计信息: - SELECT - COUNT(*) as total_summaries, - AVG(LENGTH(summary)) as avg_summary_length, - AVG(compressed_message_count) as avg_msg_count - FROM chat_summary; - -═══════════════════════════════════════════════════════════════════════════════ -⚠️ 注意事项 -═══════════════════════════════════════════════════════════════════════════════ - -1. 数据库连接 - ✓ 插件自动使用 Open WebUI 的共享数据库连接。 - ✓ 无需额外配置。 - ✓ 首次运行会自动创建 `chat_summary` 表。 - -2. 保留策略 - ⚠ `keep_first` 配置对于保留包含提示或环境变量的初始消息非常重要。请根据需要进行配置。 - -3. 性能考虑 - ⚠ 摘要生成是异步的,不会阻塞用户响应。 - ⚠ 首次达到阈值时会有短暂的后台处理时间。 - -4. 成本优化 - ⚠ 每次达到阈值会调用一次摘要模型。 - ⚠ 合理设置 `compression_threshold_tokens` 避免频繁调用。 - ⚠ 建议使用快速且经济的模型(如 `gemini-flash`)生成摘要。 - -5. 
多模态支持 - ✓ 本过滤器支持包含图片的多模态消息。 - ✓ 摘要仅针对文本内容生成。 - ✓ 在压缩过程中,非文本部分(如图片)会被保留在原始消息中。 - -═══════════════════════════════════════════════════════════════════════════════ -🐛 故障排除 -═══════════════════════════════════════════════════════════════════════════════ - -问题:数据库表未创建 -解决: - 1. 确保 Open WebUI 已正确配置数据库。 - 2. 查看 Open WebUI 的容器日志以获取详细的错误信息。 - 3. 验证 Open WebUI 的数据库连接是否正常工作。 - -问题:摘要未生成 -解决: - 1. 检查是否达到 `compression_threshold_tokens`。 - 2. 查看 `summary_model` 是否配置正确。 - 3. 检查调试日志中的错误信息。 - -问题:初始的提示或环境变量丢失 -解决: - - 确保 `keep_first` 设置为大于 0 的值,以保留包含这些信息的初始消息。 - -问题:压缩效果不明显 -解决: - 1. 适当提高 `compression_threshold_tokens`。 - 2. 减少 `keep_last` 或 `keep_first` 的数量。 - 3. 检查对话是否真的很长。 - - -""" - -from pydantic import BaseModel, Field, model_validator -from typing import Optional, Dict, Any, List, Union, Callable, Awaitable -import re -import asyncio -import json -import hashlib -import contextlib -import logging - -# 配置日志记录 -logger = logging.getLogger(__name__) -if not logger.handlers: - handler = logging.StreamHandler() - formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - ) - handler.setFormatter(formatter) - logger.addHandler(handler) -logger.setLevel(logging.INFO) - -# Open WebUI 内置导入 -from open_webui.utils.chat import generate_chat_completion -from open_webui.models.users import Users -from open_webui.models.models import Models -from fastapi.requests import Request -from open_webui.main import app as webui_app - -# Open WebUI 内部数据库 (复用共享连接) -try: - from open_webui.internal import db as owui_db -except ModuleNotFoundError: # pragma: no cover - filter runs inside Open WebUI - owui_db = None - -# 尝试导入 tiktoken -try: - import tiktoken -except ImportError: - tiktoken = None - -# 数据库导入 -from sqlalchemy import Column, String, Text, DateTime, Integer, inspect -from sqlalchemy.orm import declarative_base, sessionmaker -from sqlalchemy.engine import Engine -from datetime import datetime, timezone - - -def _discover_owui_engine(db_module: Any) -> 
Optional[Engine]: - """Discover the Open WebUI SQLAlchemy engine via provided db module helpers.""" - if db_module is None: - return None - - db_context = getattr(db_module, "get_db_context", None) or getattr( - db_module, "get_db", None - ) - if callable(db_context): - try: - with db_context() as session: - try: - return session.get_bind() - except AttributeError: - return getattr(session, "bind", None) or getattr( - session, "engine", None - ) - except Exception as exc: - print(f"[DB Discover] get_db_context failed: {exc}") - - for attr in ("engine", "ENGINE", "bind", "BIND"): - candidate = getattr(db_module, attr, None) - if candidate is not None: - return candidate - - return None - - -def _discover_owui_schema(db_module: Any) -> Optional[str]: - """Discover the Open WebUI database schema name if configured.""" - if db_module is None: - return None - - try: - base = getattr(db_module, "Base", None) - metadata = getattr(base, "metadata", None) if base is not None else None - candidate = getattr(metadata, "schema", None) if metadata is not None else None - if isinstance(candidate, str) and candidate.strip(): - return candidate.strip() - except Exception as exc: - print(f"[DB Discover] Base metadata schema lookup failed: {exc}") - - try: - metadata_obj = getattr(db_module, "metadata_obj", None) - candidate = ( - getattr(metadata_obj, "schema", None) if metadata_obj is not None else None - ) - if isinstance(candidate, str) and candidate.strip(): - return candidate.strip() - except Exception as exc: - print(f"[DB Discover] metadata_obj schema lookup failed: {exc}") - - try: - from open_webui import env as owui_env - - candidate = getattr(owui_env, "DATABASE_SCHEMA", None) - if isinstance(candidate, str) and candidate.strip(): - return candidate.strip() - except Exception as exc: - print(f"[DB Discover] env schema lookup failed: {exc}") - - return None - - -owui_engine = _discover_owui_engine(owui_db) -owui_schema = _discover_owui_schema(owui_db) -owui_Base = 
getattr(owui_db, "Base", None) if owui_db is not None else None -if owui_Base is None: - owui_Base = declarative_base() - - -class ChatSummary(owui_Base): - """对话摘要存储表""" - - __tablename__ = "chat_summary" - __table_args__ = ( - {"extend_existing": True, "schema": owui_schema} - if owui_schema - else {"extend_existing": True} - ) - - id = Column(Integer, primary_key=True, autoincrement=True) - chat_id = Column(String(255), unique=True, nullable=False, index=True) - summary = Column(Text, nullable=False) - compressed_message_count = Column(Integer, default=0) - created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc)) - updated_at = Column( - DateTime, - default=lambda: datetime.now(timezone.utc), - onupdate=lambda: datetime.now(timezone.utc), - ) - - -class Filter: - def __init__(self): - self.valves = self.Valves() - self._owui_db = owui_db - self._db_engine = owui_engine - self._fallback_session_factory = ( - sessionmaker(bind=self._db_engine) if self._db_engine else None - ) - self._threshold_cache = {} - self._init_database() - - @contextlib.contextmanager - def _db_session(self): - """Yield a database session using Open WebUI helpers with graceful fallbacks.""" - db_module = self._owui_db - db_context = None - if db_module is not None: - db_context = getattr(db_module, "get_db_context", None) or getattr( - db_module, "get_db", None - ) - - if callable(db_context): - with db_context() as session: - yield session - return - - factory = None - if db_module is not None: - factory = getattr(db_module, "SessionLocal", None) or getattr( - db_module, "ScopedSession", None - ) - if callable(factory): - session = factory() - try: - yield session - finally: - close = getattr(session, "close", None) - if callable(close): - close() - return - - if self._fallback_session_factory is None: - raise RuntimeError( - "Open WebUI database session is unavailable. Ensure Open WebUI's database layer is initialized." 
- ) - - session = self._fallback_session_factory() - try: - yield session - finally: - try: - session.close() - except Exception as exc: # pragma: no cover - best-effort cleanup - print(f"[Database] ⚠️ Failed to close fallback session: {exc}") - - def _init_database(self): - """使用 Open WebUI 的共享连接初始化数据库表""" - try: - if self._db_engine is None: - raise RuntimeError( - "Open WebUI database engine is unavailable. Ensure Open WebUI is configured with a valid DATABASE_URL." - ) - - # 使用 SQLAlchemy inspect 检查表是否存在 - inspector = inspect(self._db_engine) - if not inspector.has_table("chat_summary"): - # 如果表不存在则创建 - ChatSummary.__table__.create(bind=self._db_engine, checkfirst=True) - print( - "[数据库] ✅ 使用 Open WebUI 的共享数据库连接成功创建 chat_summary 表。" - ) - else: - print( - "[数据库] ✅ 使用 Open WebUI 的共享数据库连接。chat_summary 表已存在。" - ) - - except Exception as e: - print(f"[数据库] ❌ 初始化失败: {str(e)}") - - class Valves(BaseModel): - priority: int = Field( - default=10, description="Priority level for the filter operations." 
- ) - # Token 相关参数 - compression_threshold_tokens: int = Field( - default=64000, - ge=0, - description="当上下文总 Token 数超过此值时,触发压缩 (全局默认值)", - ) - max_context_tokens: int = Field( - default=128000, - ge=0, - description="上下文的硬性上限。超过此值将强制移除最早的消息 (全局默认值)", - ) - model_thresholds: Union[str, dict] = Field( - default={}, - description="针对特定模型的阈值覆盖配置。可以是 JSON 字符串或字典。", - ) - - @model_validator(mode="before") - @classmethod - def parse_model_thresholds(cls, data: Any) -> Any: - if isinstance(data, dict): - thresholds = data.get("model_thresholds") - if isinstance(thresholds, str) and thresholds.strip(): - try: - data["model_thresholds"] = json.loads(thresholds) - except Exception as e: - logger.error(f"Failed to parse model_thresholds JSON: {e}") - return data - - keep_first: int = Field( - default=1, ge=0, description="始终保留最初的 N 条消息。设置为 0 则不保留。" - ) - keep_last: int = Field( - default=6, ge=0, description="始终保留最近的 N 条完整消息。" - ) - summary_model: Optional[str] = Field( - default=None, - description="用于生成摘要的模型 ID。留空则使用当前对话的模型。用于匹配 model_thresholds 中的配置。", - ) - summary_model_max_context: int = Field( - default=0, - ge=0, - description="摘要模型的最大上下文 Token 数。如果为 0,则回退到 model_thresholds 或全局 max_context_tokens。", - ) - max_summary_tokens: int = Field( - default=16384, ge=1, description="摘要的最大 token 数" - ) - summary_temperature: float = Field( - default=0.1, ge=0.0, le=2.0, description="摘要生成的温度参数" - ) - debug_mode: bool = Field(default=True, description="调试模式,打印详细日志") - show_debug_log: bool = Field( - default=False, description="在浏览器控制台打印调试日志 (F12)" - ) - show_token_usage_status: bool = Field( - default=True, description="在对话结束时显示 Token 使用情况的状态通知" - ) - enable_tool_output_trimming: bool = Field( - default=False, - description="启用原生工具输出裁剪 (仅适用于 native function calling),裁剪过长的工具输出以节省 Token。", - ) - - def _save_summary(self, chat_id: str, summary: str, compressed_count: int): - """保存摘要到数据库""" - try: - with self._db_session() as session: - # 查找现有记录 - existing = 
session.query(ChatSummary).filter_by(chat_id=chat_id).first() - - if existing: - # [优化] 乐观锁检查:只有进度向前推进时才更新 - if compressed_count <= existing.compressed_message_count: - if self.valves.debug_mode: - logger.debug( - f"[存储] 跳过更新:新进度 ({compressed_count}) 不大于现有进度 ({existing.compressed_message_count})" - ) - return - - # 更新现有记录 - existing.summary = summary - existing.compressed_message_count = compressed_count - existing.updated_at = datetime.now(timezone.utc) - else: - # 创建新记录 - new_summary = ChatSummary( - chat_id=chat_id, - summary=summary, - compressed_message_count=compressed_count, - ) - session.add(new_summary) - - session.commit() - - if self.valves.debug_mode: - action = "更新" if existing else "创建" - logger.info(f"[存储] 摘要已{action}到数据库 (Chat ID: {chat_id})") - - except Exception as e: - logger.error(f"[存储] ❌ 数据库保存失败: {str(e)}") - - def _load_summary_record(self, chat_id: str) -> Optional[ChatSummary]: - """从数据库加载摘要记录对象""" - try: - with self._db_session() as session: - record = session.query(ChatSummary).filter_by(chat_id=chat_id).first() - if record: - # Detach the object from the session so it can be used after session close - session.expunge(record) - return record - except Exception as e: - logger.error(f"[加载] ❌ 数据库读取失败: {str(e)}") - return None - - def _load_summary(self, chat_id: str, body: dict) -> Optional[str]: - """从数据库加载摘要文本 (兼容旧接口)""" - record = self._load_summary_record(chat_id) - if record: - if self.valves.debug_mode: - logger.debug(f"[加载] 从数据库加载摘要 (Chat ID: {chat_id})") - logger.debug( - f"[加载] 更新时间: {record.updated_at}, 已压缩消息数: {record.compressed_message_count}" - ) - return record.summary - return None - - def _count_tokens(self, text: str) -> int: - """计算文本的 Token 数量""" - if not text: - return 0 - - if tiktoken: - try: - # 统一使用 o200k_base 编码 (适配最新模型) - encoding = tiktoken.get_encoding("o200k_base") - return len(encoding.encode(text)) - except Exception as e: - if self.valves.debug_mode: - print(f"[Token计数] tiktoken 错误: {e},回退到字符估算") - - # 
回退策略:粗略估算 (1 token ≈ 4 chars) - return len(text) // 4 - - def _calculate_messages_tokens(self, messages: List[Dict]) -> int: - """计算消息列表的总 Token 数""" - total_tokens = 0 - for msg in messages: - content = msg.get("content", "") - if isinstance(content, list): - # 多模态内容处理 - text_content = "" - for part in content: - if isinstance(part, dict) and part.get("type") == "text": - text_content += part.get("text", "") - total_tokens += self._count_tokens(text_content) - else: - total_tokens += self._count_tokens(str(content)) - return total_tokens - - def _get_model_thresholds(self, model_id: str) -> Dict[str, int]: - """获取特定模型的阈值配置 - - 优先级: - 1. 缓存匹配 - 2. model_thresholds 直接匹配 - 3. 基础模型 (base_model_id) 匹配 - 4. 全局默认配置 - """ - if not model_id: - return { - "compression_threshold_tokens": self.valves.compression_threshold_tokens, - "max_context_tokens": self.valves.max_context_tokens, - } - - # 1. 检查缓存 - if model_id in self._threshold_cache: - return self._threshold_cache[model_id] - - # 获取解析后的阈值配置 - parsed = self.valves.model_thresholds - if isinstance(parsed, str): - try: - parsed = json.loads(parsed) - except Exception: - parsed = {} - - # 2. 尝试直接匹配 - if model_id in parsed: - res = parsed[model_id] - self._threshold_cache[model_id] = res - if self.valves.debug_mode: - logger.debug(f"[配置] 模型 {model_id} 命中直接配置") - return res - - # 3. 
尝试匹配基础模型 (base_model_id) - try: - model_obj = Models.get_model_by_id(model_id) - if model_obj: - # 某些模型可能有多个基础模型 ID - base_ids = [] - if hasattr(model_obj, "base_model_id") and model_obj.base_model_id: - base_ids.append(model_obj.base_model_id) - if hasattr(model_obj, "base_model_ids") and model_obj.base_model_ids: - if isinstance(model_obj.base_model_ids, list): - base_ids.extend(model_obj.base_model_ids) - - for b_id in base_ids: - if b_id in parsed: - res = parsed[b_id] - self._threshold_cache[model_id] = res - if self.valves.debug_mode: - logger.info( - f"[配置] 模型 {model_id} 匹配到基础模型 {b_id} 的配置" - ) - return res - except Exception as e: - logger.error(f"[配置] 查找基础模型失败: {e}") - - # 4. 使用全局默认配置 - res = { - "compression_threshold_tokens": self.valves.compression_threshold_tokens, - "max_context_tokens": self.valves.max_context_tokens, - } - self._threshold_cache[model_id] = res - return res - - def _get_chat_context( - self, body: dict, __metadata__: Optional[dict] = None - ) -> Dict[str, str]: - """ - 统一提取聊天上下文信息 (chat_id, message_id)。 - 优先从 body 中提取,其次从 metadata 中提取。 - """ - chat_id = "" - message_id = "" - - # 1. 尝试从 body 获取 - if isinstance(body, dict): - chat_id = body.get("chat_id", "") - message_id = body.get("id", "") # message_id 在 body 中通常是 id - - # 再次检查 body.metadata - if not chat_id or not message_id: - body_metadata = body.get("metadata", {}) - if isinstance(body_metadata, dict): - if not chat_id: - chat_id = body_metadata.get("chat_id", "") - if not message_id: - message_id = body_metadata.get("message_id", "") - - # 2. 
尝试从 __metadata__ 获取 (作为补充) - if __metadata__ and isinstance(__metadata__, dict): - if not chat_id: - chat_id = __metadata__.get("chat_id", "") - if not message_id: - message_id = __metadata__.get("message_id", "") - - return { - "chat_id": str(chat_id).strip(), - "message_id": str(message_id).strip(), - } - - async def _emit_debug_log( - self, - __event_call__, - chat_id: str, - original_count: int, - compressed_count: int, - summary_length: int, - kept_first: int, - kept_last: int, - ): - """Emit debug log to browser console via JS execution""" - if not self.valves.show_debug_log or not __event_call__: - return - - try: - # Prepare data for JS - log_data = { - "chatId": chat_id, - "originalCount": original_count, - "compressedCount": compressed_count, - "summaryLength": summary_length, - "keptFirst": kept_first, - "keptLast": kept_last, - "ratio": ( - f"{(1 - compressed_count/original_count)*100:.1f}%" - if original_count > 0 - else "0%" - ), - } - - # Construct JS code - js_code = f""" - (async function() {{ - console.group("🗜️ Async Context Compression Debug"); - console.log("Chat ID:", {json.dumps(chat_id)}); - console.log("Messages:", {original_count} + " -> " + {compressed_count}); - console.log("Compression Ratio:", {json.dumps(log_data['ratio'])}); - console.log("Summary Length:", {summary_length} + " chars"); - console.log("Configuration:", {{ - "Keep First": {kept_first}, - "Keep Last": {kept_last} - }}); - console.groupEnd(); - }})(); - """ - - await __event_call__( - { - "type": "execute", - "data": {"code": js_code}, - } - ) - except Exception as e: - print(f"Error emitting debug log: {e}") - - async def _log(self, message: str, log_type: str = "info", event_call=None): - """统一日志输出到后端 (print) 和前端 (console.log)""" - # 后端日志 - if self.valves.debug_mode: - print(message) - - # 前端日志 - if self.valves.show_debug_log and event_call: - try: - css = "color: #3b82f6;" # 默认蓝色 - if log_type == "error": - css = "color: #ef4444; font-weight: bold;" # 红色 - elif 
log_type == "warning": - css = "color: #f59e0b;" # 橙色 - elif log_type == "success": - css = "color: #10b981; font-weight: bold;" # 绿色 - - # 清理前端消息:移除分隔符和多余换行 - lines = message.split("\n") - # 保留不以大量等号或连字符开头的行 - filtered_lines = [ - line - for line in lines - if not line.strip().startswith("====") - and not line.strip().startswith("----") - ] - clean_message = "\n".join(filtered_lines).strip() - - if not clean_message: - return - - # 转义消息中的引号和换行符 - safe_message = clean_message.replace('"', '\\"').replace("\n", "\\n") - - js_code = f""" - console.log("%c[压缩] {safe_message}", "{css}"); - """ - await event_call({"type": "execute", "data": {"code": js_code}}) - except Exception as e: - logger.error(f"发送前端日志失败: {e}") - - async def inlet( - self, - body: dict, - __user__: Optional[dict] = None, - __metadata__: dict = None, - __request__: Request = None, - __model__: dict = None, - __event_emitter__: Callable[[Any], Awaitable[None]] = None, - __event_call__: Callable[[Any], Awaitable[None]] = None, - ) -> dict: - """ - 在发送到 LLM 之前执行 - 压缩策略: - 1. 注入已有摘要 - 2. 预检 Token 预算 - 3. 如果超限,执行结构化裁剪(Structure-Aware Trimming)或丢弃旧消息 - """ - messages = body.get("messages", []) - - # --- 原生工具输出裁剪 (Native Tool Output Trimming) --- - metadata = body.get("metadata", {}) - is_native_func_calling = metadata.get("function_calling") == "native" - - if self.valves.enable_tool_output_trimming and is_native_func_calling: - trimmed_count = 0 - for msg in messages: - content = msg.get("content", "") - if not isinstance(content, str): - continue - - role = msg.get("role") - - # 仅处理带有原生工具输出的助手消息 - if role == "assistant": - # 检测助手内容中的工具输出标记 - if "tool_call_id:" in content or ( - content.startswith('"') and "\\"" in content - ): - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] 🔍 检测到助手消息中的原生工具输出。", - event_call=__event_call__, - ) - - # 提取最终答案(在最后一个工具调用元数据之后) - # 模式:匹配转义的 JSON 字符串,如 """...""" 后跟换行符 - # 我们寻找该模式的最后一次出现,并获取其后的所有内容 - - # 1. 
尝试匹配特定的 OpenWebUI 工具输出格式:"""...""" - tool_output_pattern = r'""".*?"""\s*' - - # 查找所有匹配项 - matches = list( - re.finditer(tool_output_pattern, content, re.DOTALL) - ) - - if matches: - # 获取最后一个匹配项的结束位置 - last_match_end = matches[-1].end() - - # 最后一个工具输出之后的所有内容即为最终答案 - final_answer = content[last_match_end:].strip() - - if final_answer: - msg["content"] = ( - f"... [Tool outputs trimmed]\n{final_answer}" - ) - trimmed_count += 1 - else: - # 回退:如果找不到新格式,尝试按 "Arguments:" 分割 - # (保留向后兼容性或适应不同模型行为) - parts = re.split(r"(?:Arguments:\s*\{[^}]+\})\n+", content) - if len(parts) > 1: - final_answer = parts[-1].strip() - if final_answer: - msg["content"] = ( - f"... [Tool outputs trimmed]\n{final_answer}" - ) - trimmed_count += 1 - - if trimmed_count > 0 and self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] ✂️ 已裁剪 {trimmed_count} 条工具输出消息。", - event_call=__event_call__, - ) - - chat_ctx = self._get_chat_context(body, __metadata__) - chat_id = chat_ctx["chat_id"] - - # 提取系统提示词以进行准确的 Token 计算 - # 1. 对于自定义模型:检查数据库 (Models.get_model_by_id) - # 2. 
对于基础模型:检查消息中的 role='system' - system_prompt_content = None - - # 尝试从数据库获取 (自定义模型) - try: - model_id = body.get("model") - if model_id: - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] 🔍 尝试从数据库查找模型: {model_id}", - event_call=__event_call__, - ) - - # 清理模型 ID - model_obj = Models.get_model_by_id(model_id) - - if model_obj: - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] ✅ 数据库中找到模型: {model_obj.name} (ID: {model_obj.id})", - event_call=__event_call__, - ) - - if model_obj.params: - try: - params = model_obj.params - # 处理 params 是 JSON 字符串的情况 - if isinstance(params, str): - params = json.loads(params) - # 转换 Pydantic 模型为字典 - elif hasattr(params, "model_dump"): - params = params.model_dump() - elif hasattr(params, "dict"): - params = params.dict() - - # 处理字典 - if isinstance(params, dict): - system_prompt_content = params.get("system") - else: - # 回退:尝试 getattr - system_prompt_content = getattr(params, "system", None) - - if system_prompt_content: - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] 📝 在数据库参数中找到系统提示词 ({len(system_prompt_content)} 字符)", - event_call=__event_call__, - ) - else: - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] ⚠️ 模型参数中缺少 'system' 键", - event_call=__event_call__, - ) - except Exception as e: - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] ❌ 解析模型参数失败: {e}", - log_type="error", - event_call=__event_call__, - ) - - else: - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] ⚠️ 模型参数为空", - event_call=__event_call__, - ) - else: - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] ❌ 数据库中未找到模型", - log_type="warning", - event_call=__event_call__, - ) - - except Exception as e: - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] ❌ 从数据库获取系统提示词错误: {e}", - log_type="error", - event_call=__event_call__, 
- ) - if self.valves.debug_mode: - logger.error(f"[Inlet] 从数据库获取系统提示词错误: {e}") - - # 回退:检查消息列表 (基础模型或已包含) - if not system_prompt_content: - for msg in messages: - if msg.get("role") == "system": - system_prompt_content = msg.get("content", "") - break - - # 构建 system_prompt_msg 用于 Token 计算 - system_prompt_msg = None - if system_prompt_content: - system_prompt_msg = {"role": "system", "content": system_prompt_content} - if self.valves.debug_mode: - logger.debug( - f"[Inlet] 找到系统提示词 ({len(system_prompt_content)} 字符)。计入预算。" - ) - - # 记录消息统计信息 (移至此处以包含提取的系统提示词) - if self.valves.show_debug_log and __event_call__: - try: - msg_stats = { - "user": 0, - "assistant": 0, - "system": 0, - "total": len(messages), - } - for msg in messages: - role = msg.get("role", "unknown") - if role in msg_stats: - msg_stats[role] += 1 - - # 如果系统提示词是从 DB/Model 提取的但不在消息中,则计数 - if system_prompt_content: - # 检查是否已计数 (即是否在消息中) - is_in_messages = any(m.get("role") == "system" for m in messages) - if not is_in_messages: - msg_stats["system"] += 1 - msg_stats["total"] += 1 - - stats_str = f"Total: {msg_stats['total']} | User: {msg_stats['user']} | Assistant: {msg_stats['assistant']} | System: {msg_stats['system']}" - await self._log( - f"[Inlet] 消息统计: {stats_str}", event_call=__event_call__ - ) - except Exception as e: - logger.error(f"[Inlet] 记录消息统计错误: {e}") - - if not chat_id: - await self._log( - "[Inlet] ❌ metadata 中缺少 chat_id,跳过压缩", - log_type="error", - event_call=__event_call__, - ) - return body - - if self.valves.debug_mode or self.valves.show_debug_log: - await self._log( - f"\n{'='*60}\n[Inlet] Chat ID: {chat_id}\n[Inlet] 收到 {len(messages)} 条消息", - event_call=__event_call__, - ) - - # 记录原始消息的目标压缩进度,供 outlet 使用 - # 目标是压缩到倒数第 keep_last 条之前 - target_compressed_count = max(0, len(messages) - self.valves.keep_last) - - await self._log( - f"[Inlet] 记录目标压缩进度: {target_compressed_count}", - event_call=__event_call__, - ) - - # 加载摘要记录 - summary_record = await 
asyncio.to_thread(self._load_summary_record, chat_id) - - # 计算 effective_keep_first 以确保所有系统消息都被保护 - last_system_index = -1 - for i, msg in enumerate(messages): - if msg.get("role") == "system": - last_system_index = i - - effective_keep_first = max(self.valves.keep_first, last_system_index + 1) - - final_messages = [] - - if summary_record: - # 存在摘要,构建视图:[Head] + [Summary Message] + [Tail] - # Tail 是从上次压缩点之后的所有消息 - compressed_count = summary_record.compressed_message_count - - # 确保 compressed_count 合理 - if compressed_count > len(messages): - compressed_count = max(0, len(messages) - self.valves.keep_last) - - # 1. 头部消息 (Keep First) - head_messages = [] - if effective_keep_first > 0: - head_messages = messages[:effective_keep_first] - - # 2. 摘要消息 (作为 User 消息插入) - summary_content = ( - f"【系统提示:以下是历史对话的摘要,仅供参考上下文,请勿对摘要内容进行回复,直接回答后续的最新问题】\n\n" - f"{summary_record.summary}\n\n" - f"---\n" - f"以下是最近的对话:" - ) - summary_msg = {"role": "assistant", "content": summary_content} - - # 3. 尾部消息 (Tail) - 从上次压缩点开始的所有消息 - # 注意:这里必须确保不重复包含头部消息 - start_index = max(compressed_count, effective_keep_first) - tail_messages = messages[start_index:] - - if self.valves.show_debug_log and __event_call__: - tail_preview = [ - f"{i + start_index}: [{m.get('role')}] {m.get('content', '')[:30]}..." 
- for i, m in enumerate(tail_messages) - ] - await self._log( - f"[Inlet] 📜 尾部消息 (起始索引: {start_index}): {tail_preview}", - event_call=__event_call__, - ) - - # --- 预检检查与预算 (Preflight Check & Budgeting) --- - - # 组装候选消息 (用于输出) - candidate_messages = head_messages + [summary_msg] + tail_messages - - # 准备用于 Token 计算的消息 (如果缺少则包含系统提示词) - calc_messages = candidate_messages - if system_prompt_msg: - # 检查系统提示词是否已在 head_messages 中 - is_in_head = any(m.get("role") == "system" for m in head_messages) - if not is_in_head: - calc_messages = [system_prompt_msg] + candidate_messages - - # 获取最大上下文限制 - model = self._clean_model_id(body.get("model")) - thresholds = self._get_model_thresholds(model) or {} - max_context_tokens = thresholds.get( - "max_context_tokens", self.valves.max_context_tokens - ) - - # 计算总 Token - total_tokens = await asyncio.to_thread( - self._calculate_messages_tokens, calc_messages - ) - - # 预检检查日志 - await self._log( - f"[Inlet] 🔎 预检检查: {total_tokens}t / {max_context_tokens}t ({(total_tokens/max_context_tokens*100):.1f}%)", - event_call=__event_call__, - ) - - # 如果超出预算,缩减历史记录 (Keep Last) - if total_tokens > max_context_tokens: - await self._log( - f"[Inlet] ⚠️ 候选提示词 ({total_tokens} Tokens) 超过上限 ({max_context_tokens})。正在缩减历史记录...", - log_type="warning", - event_call=__event_call__, - ) - - # 动态从 tail_messages 的开头移除消息 - # 始终尝试保留至少最后一条消息 (通常是用户输入) - while total_tokens > max_context_tokens and len(tail_messages) > 1: - # 策略 1: 结构化助手消息裁剪 (Structure-Aware Assistant Trimming) - # 保留: 标题 (#), 第一行, 最后一行。折叠其余部分。 - target_msg = None - target_idx = -1 - - # 查找最旧的、较长且尚未裁剪的助手消息 - for i, msg in enumerate(tail_messages): - # 跳过最后一条消息 (通常是用户输入,保护它) - if i == len(tail_messages) - 1: - break - - if msg.get("role") == "assistant": - content = str(msg.get("content", "")) - is_trimmed = msg.get("metadata", {}).get( - "is_trimmed", False - ) - # 仅针对相当长 (> 200 字符) 的消息 - if len(content) > 200 and not is_trimmed: - target_msg = msg - target_idx = i - break - - # 如果找到合适的助手消息,应用结构化裁剪 - 
if target_msg: - content = str(target_msg.get("content", "")) - lines = content.split("\n") - kept_lines = [] - - # 逻辑: 保留标题, 第一行非空行, 最后一行非空行 - first_line_found = False - last_line_idx = -1 - - # 查找最后一行非空行的索引 - for idx in range(len(lines) - 1, -1, -1): - if lines[idx].strip(): - last_line_idx = idx - break - - for idx, line in enumerate(lines): - stripped = line.strip() - if not stripped: - continue - - # 保留标题 (H1-H6, 需要 # 后有空格) - if re.match(r"^#{1,6}\s+", stripped): - kept_lines.append(line) - continue - - # 保留第一行非空行 - if not first_line_found: - kept_lines.append(line) - first_line_found = True - # 如果后面还有内容,添加占位符 - if idx < last_line_idx: - kept_lines.append("\n... [Content collapsed] ...\n") - continue - - # 保留最后一行非空行 - if idx == last_line_idx: - kept_lines.append(line) - continue - - # 更新消息内容 - new_content = "\n".join(kept_lines) - - # 安全检查: 如果裁剪没有节省太多 (例如主要是标题),则强制丢弃 - if len(new_content) > len(content) * 0.8: - # 如果结构保留过于冗长,回退到丢弃 - pass - else: - target_msg["content"] = new_content - if "metadata" not in target_msg: - target_msg["metadata"] = {} - target_msg["metadata"]["is_trimmed"] = True - - # 计算 Token 减少量 - old_tokens = self._count_tokens(content) - new_tokens = self._count_tokens(target_msg["content"]) - diff = old_tokens - new_tokens - total_tokens -= diff - - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] 📉 结构化裁剪助手消息。节省: {diff} tokens。", - event_call=__event_call__, - ) - continue - - # 策略 2: 回退 - 完全丢弃最旧的消息 (FIFO) - dropped = tail_messages.pop(0) - dropped_tokens = self._count_tokens(str(dropped.get("content", ""))) - total_tokens -= dropped_tokens - - if self.valves.show_debug_log and __event_call__: - await self._log( - f"[Inlet] 🗑️ 从历史记录中丢弃消息以适应上下文。角色: {dropped.get('role')}, Tokens: {dropped_tokens}", - event_call=__event_call__, - ) - - # 重新组装 - candidate_messages = head_messages + [summary_msg] + tail_messages - - await self._log( - f"[Inlet] ✂️ 历史记录已缩减。新总数: {total_tokens} Tokens (尾部大小: {len(tail_messages)})", - 
event_call=__event_call__, - ) - - final_messages = candidate_messages - - # 计算详细 Token 统计以用于日志 - system_tokens = ( - self._count_tokens(system_prompt_msg.get("content", "")) - if system_prompt_msg - else 0 - ) - head_tokens = self._calculate_messages_tokens(head_messages) - summary_tokens = self._count_tokens(summary_content) - tail_tokens = self._calculate_messages_tokens(tail_messages) - - system_info = ( - f"System({system_tokens}t)" if system_prompt_msg else "System(0t)" - ) - - total_section_tokens = ( - system_tokens + head_tokens + summary_tokens + tail_tokens - ) - - await self._log( - f"[Inlet] 应用摘要: {system_info} + Head({len(head_messages)} 条, {head_tokens}t) + Summary({summary_tokens}t) + Tail({len(tail_messages)} 条, {tail_tokens}t) = Total({total_section_tokens}t)", - log_type="success", - event_call=__event_call__, - ) - - # 准备状态消息 (上下文使用量格式) - if max_context_tokens > 0: - usage_ratio = total_section_tokens / max_context_tokens - status_msg = f"上下文使用量 (预估): {total_section_tokens} / {max_context_tokens} Tokens ({usage_ratio*100:.1f}%)" - if usage_ratio > 0.9: - status_msg += " | ⚠️ 高负载" - else: - status_msg = f"已加载历史摘要 (隐藏 {compressed_count} 条历史消息)" - - if __event_emitter__: - await __event_emitter__( - { - "type": "status", - "data": { - "description": status_msg, - "done": True, - }, - } - ) - - # Emit debug log to frontend (Keep the structured log as well) - await self._emit_debug_log( - __event_call__, - chat_id, - len(messages), - len(final_messages), - len(summary_record.summary), - self.valves.keep_first, - self.valves.keep_last, - ) - else: - # 没有摘要,使用原始消息 - # 但仍然需要检查预算! 
- final_messages = messages - - # 包含系统提示词进行计算 - calc_messages = final_messages - if system_prompt_msg: - is_in_messages = any(m.get("role") == "system" for m in final_messages) - if not is_in_messages: - calc_messages = [system_prompt_msg] + final_messages - - # 获取最大上下文限制 - model = self._clean_model_id(body.get("model")) - thresholds = self._get_model_thresholds(model) or {} - max_context_tokens = thresholds.get( - "max_context_tokens", self.valves.max_context_tokens - ) - - total_tokens = await asyncio.to_thread( - self._calculate_messages_tokens, calc_messages - ) - - if total_tokens > max_context_tokens: - await self._log( - f"[Inlet] ⚠️ 原始消息 ({total_tokens} Tokens) 超过上限 ({max_context_tokens})。正在缩减历史记录...", - log_type="warning", - event_call=__event_call__, - ) - - # 动态从开头移除消息 - # 我们将遵守 effective_keep_first 以保护系统提示词 - - start_trim_index = effective_keep_first - - while ( - total_tokens > max_context_tokens - and len(final_messages) - > start_trim_index + 1 # 保留 keep_first 之后至少 1 条消息 - ): - dropped = final_messages.pop(start_trim_index) - dropped_tokens = self._count_tokens(str(dropped.get("content", ""))) - total_tokens -= dropped_tokens - - await self._log( - f"[Inlet] ✂️ 消息已缩减。新总数: {total_tokens} Tokens", - event_call=__event_call__, - ) - - # 发送状态通知 (上下文使用量格式) - if __event_emitter__: - status_msg = ( - f"上下文使用量 (预估): {total_tokens} / {max_context_tokens} Tokens" - ) - if max_context_tokens > 0: - usage_ratio = total_tokens / max_context_tokens - status_msg += f" ({usage_ratio*100:.1f}%)" - if usage_ratio > 0.9: - status_msg += " | ⚠️ 高负载" - - await __event_emitter__( - { - "type": "status", - "data": { - "description": status_msg, - "done": True, - }, - } - ) - - body["messages"] = final_messages - - await self._log( - f"[Inlet] 最终发送: {len(body['messages'])} 条消息\n{'='*60}\n", - event_call=__event_call__, - ) - - return body - - async def outlet( - self, - body: dict, - __user__: Optional[dict] = None, - __metadata__: dict = None, - __event_emitter__: 
Callable[[Any], Awaitable[None]] = None, - __event_call__: Callable[[Any], Awaitable[None]] = None, - ) -> dict: - """ - 在 LLM 响应完成后执行 - 在后台计算 Token 数并触发摘要生成(不阻塞当前响应,不影响内容输出) - """ - chat_ctx = self._get_chat_context(body, __metadata__) - chat_id = chat_ctx["chat_id"] - if not chat_id: - await self._log( - "[Outlet] ❌ metadata 中缺少 chat_id,跳过压缩", - log_type="error", - event_call=__event_call__, - ) - return body - model = body.get("model") or "" - messages = body.get("messages", []) - - # 直接计算目标压缩进度 - target_compressed_count = max(0, len(messages) - self.valves.keep_last) - - # 在后台异步处理 Token 计算和摘要生成(不等待完成,不影响输出) - asyncio.create_task( - self._check_and_generate_summary_async( - chat_id, - model, - body, - __user__, - target_compressed_count, - __event_emitter__, - __event_call__, - ) - ) - - return body - - async def _check_and_generate_summary_async( - self, - chat_id: str, - model: str, - body: dict, - user_data: Optional[dict], - target_compressed_count: Optional[int], - __event_emitter__: Callable[[Any], Awaitable[None]] = None, - __event_call__: Callable[[Any], Awaitable[None]] = None, - ): - """ - 后台处理:计算 Token 数并生成摘要(不阻塞响应) - """ - try: - messages = body.get("messages", []) - - # 获取当前模型的阈值配置 - thresholds = self._get_model_thresholds(model) or {} - compression_threshold_tokens = thresholds.get( - "compression_threshold_tokens", self.valves.compression_threshold_tokens - ) - - await self._log( - f"\n[🔍 后台计算] 开始 Token 计数...", - event_call=__event_call__, - ) - - # 在后台线程中计算 Token 数 - current_tokens = await asyncio.to_thread( - self._calculate_messages_tokens, messages - ) - - await self._log( - f"[🔍 后台计算] Token 数: {current_tokens}", - event_call=__event_call__, - ) - - # 检查是否需要压缩 - if current_tokens >= compression_threshold_tokens: - await self._log( - f"[🔍 后台计算] ⚡ 触发压缩阈值 (Token: {current_tokens} >= {compression_threshold_tokens})", - log_type="warning", - event_call=__event_call__, - ) - - # 继续生成摘要 - await self._generate_summary_async( - messages, - chat_id, - 
body, - user_data, - target_compressed_count, - __event_emitter__, - __event_call__, - ) - else: - await self._log( - f"[🔍 后台计算] 未触发压缩阈值 (Token: {current_tokens} < {compression_threshold_tokens})", - event_call=__event_call__, - ) - - except Exception as e: - await self._log( - f"[🔍 后台计算] ❌ 错误: {str(e)}", - log_type="error", - event_call=__event_call__, - ) - - def _clean_model_id(self, model_id: Optional[str]) -> Optional[str]: - """Cleans the model ID by removing whitespace and quotes.""" - if not model_id: - return None - cleaned = model_id.strip().strip('"').strip("'") - return cleaned if cleaned else None - - async def _generate_summary_async( - self, - messages: list, - chat_id: str, - body: dict, - user_data: Optional[dict], - target_compressed_count: Optional[int], - __event_emitter__: Callable[[Any], Awaitable[None]] = None, - __event_call__: Callable[[Any], Awaitable[None]] = None, - ): - """ - 异步生成摘要(后台执行,不阻塞响应) - 逻辑: - 1. 提取中间消息(去除 keep_first 和 keep_last)。 - 2. 检查 Token 上限,如果超过 max_context_tokens,从中间消息头部移除。 - 3. 对剩余的中间消息生成摘要。 - """ - try: - await self._log(f"\n[🤖 异步摘要任务] 开始...", event_call=__event_call__) - - # 1. 获取目标压缩进度 - # 如果未传递 target_compressed_count(新逻辑下不应发生),则进行估算 - if target_compressed_count is None: - target_compressed_count = max(0, len(messages) - self.valves.keep_last) - await self._log( - f"[🤖 异步摘要任务] ⚠️ target_compressed_count 为 None,进行估算: {target_compressed_count}", - log_type="warning", - event_call=__event_call__, - ) - - # 2. 
确定待压缩的消息范围 (Middle) - start_index = self.valves.keep_first - end_index = len(messages) - self.valves.keep_last - if self.valves.keep_last == 0: - end_index = len(messages) - - # 确保索引有效 - if start_index >= end_index: - await self._log( - f"[🤖 异步摘要任务] 中间消息为空 (Start: {start_index}, End: {end_index}),跳过", - event_call=__event_call__, - ) - return - - middle_messages = messages[start_index:end_index] - tail_preview_msgs = messages[end_index:] - - if self.valves.show_debug_log and __event_call__: - middle_preview = [ - f"{i + start_index}: [{m.get('role')}] {m.get('content', '')[:20]}..." - for i, m in enumerate(middle_messages[:3]) - ] - tail_preview = [ - f"{i + end_index}: [{m.get('role')}] {m.get('content', '')[:20]}..." - for i, m in enumerate(tail_preview_msgs) - ] - await self._log( - f"[🤖 异步摘要任务] 📊 边界检查:\n" - f" - 中间 (压缩): {len(middle_messages)} 条 (索引 {start_index}-{end_index-1}) -> 预览: {middle_preview}\n" - f" - 尾部 (保留): {len(tail_preview_msgs)} 条 (索引 {end_index}-End) -> 预览: {tail_preview}", - event_call=__event_call__, - ) - - # 3. 检查 Token 上限并截断 (Max Context Truncation) - # [优化] 使用摘要模型(如果有)的阈值来决定能处理多少中间消息 - # 这样可以用长窗口模型(如 gemini-flash)来压缩超过当前模型窗口的历史记录 - summary_model_id = self._clean_model_id( - self.valves.summary_model - ) or self._clean_model_id(body.get("model")) - - if not summary_model_id: - await self._log( - "[🤖 异步摘要任务] ⚠️ 摘要模型不存在,跳过压缩", - log_type="warning", - event_call=__event_call__, - ) - return - - thresholds = self._get_model_thresholds(summary_model_id) or {} - # Priority: 1. summary_model_max_context (if > 0) -> 2. model_thresholds -> 3. 
global max_context_tokens - if self.valves.summary_model_max_context > 0: - max_context_tokens = self.valves.summary_model_max_context - else: - max_context_tokens = thresholds.get( - "max_context_tokens", self.valves.max_context_tokens - ) - - await self._log( - f"[🤖 异步摘要任务] 使用模型 {summary_model_id} 的上限: {max_context_tokens} Tokens", - event_call=__event_call__, - ) - - # 计算中间消息的 Token (加上提示词的缓冲) - # 我们只把 middle_messages 发送给摘要模型,所以不应该把完整历史计入限制 - middle_tokens = await asyncio.to_thread( - self._calculate_messages_tokens, middle_messages - ) - # 增加提示词和输出的缓冲 (约 2000 Tokens) - estimated_input_tokens = middle_tokens + 2000 - - if estimated_input_tokens > max_context_tokens: - excess_tokens = estimated_input_tokens - max_context_tokens - await self._log( - f"[🤖 异步摘要任务] ⚠️ 中间消息 ({middle_tokens} Tokens) + 缓冲超过摘要模型上限 ({max_context_tokens}),需要移除约 {excess_tokens} Token", - log_type="warning", - event_call=__event_call__, - ) - - # 从 middle_messages 头部开始移除 - removed_tokens = 0 - removed_count = 0 - - while removed_tokens < excess_tokens and middle_messages: - msg_to_remove = middle_messages.pop(0) - msg_tokens = self._count_tokens( - str(msg_to_remove.get("content", "")) - ) - removed_tokens += msg_tokens - removed_count += 1 - - await self._log( - f"[🤖 异步摘要任务] 已移除 {removed_count} 条消息,共 {removed_tokens} Token", - event_call=__event_call__, - ) - - if not middle_messages: - await self._log( - f"[🤖 异步摘要任务] 截断后中间消息为空,跳过摘要生成", - event_call=__event_call__, - ) - return - - # 4. 构建对话文本 - conversation_text = self._format_messages_for_summary(middle_messages) - - # 5. 
调用 LLM 生成新摘要 - # 注意:这里不再传入 previous_summary,因为旧摘要(如果有)已经包含在 middle_messages 里了 - - # 发送开始生成摘要的状态通知 - if __event_emitter__: - await __event_emitter__( - { - "type": "status", - "data": { - "description": "正在后台生成上下文摘要...", - "done": False, - }, - } - ) - - new_summary = await self._call_summary_llm( - None, - conversation_text, - {**body, "model": summary_model_id}, - user_data, - __event_call__, - ) - - if not new_summary: - await self._log( - "[🤖 异步摘要任务] ⚠️ 摘要生成返回空结果,跳过保存", - log_type="warning", - event_call=__event_call__, - ) - return - - # 6. 保存新摘要 - await self._log( - "[优化] 在后台线程中保存摘要以避免阻塞事件循环。", - event_call=__event_call__, - ) - - await asyncio.to_thread( - self._save_summary, chat_id, new_summary, target_compressed_count - ) - - # 发送完成状态通知 - if __event_emitter__: - await __event_emitter__( - { - "type": "status", - "data": { - "description": f"上下文摘要已更新 (压缩了 {len(middle_messages)} 条消息)", - "done": True, - }, - } - ) - - await self._log( - f"[🤖 异步摘要任务] ✅ 完成!新摘要长度: {len(new_summary)} 字符", - log_type="success", - event_call=__event_call__, - ) - await self._log( - f"[🤖 异步摘要任务] 进度更新: 已压缩至原始消息 {target_compressed_count}", - event_call=__event_call__, - ) - - # --- Token 使用情况状态通知 --- - if self.valves.show_token_usage_status and __event_emitter__: - try: - # 1. 获取系统提示词 (DB 回退) - system_prompt_msg = None - model_id = body.get("model") - if model_id: - try: - model_obj = Models.get_model_by_id(model_id) - if model_obj and model_obj.params: - params = model_obj.params - if isinstance(params, str): - params = json.loads(params) - if isinstance(params, dict): - sys_content = params.get("system") - else: - sys_content = getattr(params, "system", None) - - if sys_content: - system_prompt_msg = { - "role": "system", - "content": sys_content, - } - except Exception: - pass # 忽略 DB 错误,尽力而为 - - # 2. 
计算 Effective Keep First - last_system_index = -1 - for i, msg in enumerate(messages): - if msg.get("role") == "system": - last_system_index = i - effective_keep_first = max( - self.valves.keep_first, last_system_index + 1 - ) - - # 3. 构建下一个上下文 (Next Context) - # Head - head_msgs = ( - messages[:effective_keep_first] - if effective_keep_first > 0 - else [] - ) - - # Summary - summary_content = ( - f"【系统提示:以下是历史对话的摘要,仅供参考上下文,请勿对摘要内容进行回复,直接回答后续的最新问题】\n\n" - f"{new_summary}\n\n" - f"---\n" - f"以下是最近的对话:" - ) - summary_msg = {"role": "assistant", "content": summary_content} - - # Tail (使用 target_compressed_count,这是我们刚刚压缩到的位置) - # 注意:target_compressed_count 是要被摘要覆盖的消息数(不包括 keep_last) - # 所以 tail 从 max(target_compressed_count, effective_keep_first) 开始 - start_index = max(target_compressed_count, effective_keep_first) - tail_msgs = messages[start_index:] - - # 组装 - next_context = head_msgs + [summary_msg] + tail_msgs - - # 如果需要,注入系统提示词 - if system_prompt_msg: - is_in_head = any(m.get("role") == "system" for m in head_msgs) - if not is_in_head: - next_context = [system_prompt_msg] + next_context - - # 4. 计算 Token - token_count = self._calculate_messages_tokens(next_context) - - # 5. 获取阈值并计算比例 - model = self._clean_model_id(body.get("model")) - thresholds = self._get_model_thresholds(model) or {} - # Priority: 1. summary_model_max_context (if > 0) -> 2. model_thresholds -> 3. global max_context_tokens - if self.valves.summary_model_max_context > 0: - max_context_tokens = self.valves.summary_model_max_context - else: - max_context_tokens = thresholds.get( - "max_context_tokens", self.valves.max_context_tokens - ) - - # 6. 
发送状态 - status_msg = ( - f"上下文摘要已更新: {token_count} / {max_context_tokens} Tokens" - ) - if max_context_tokens > 0: - ratio = (token_count / max_context_tokens) * 100 - status_msg += f" ({ratio:.1f}%)" - if ratio > 90.0: - status_msg += " | ⚠️ 高负载" - - await __event_emitter__( - { - "type": "status", - "data": { - "description": status_msg, - "done": True, - }, - } - ) - except Exception as e: - await self._log( - f"[Status] 计算 Token 错误: {e}", - log_type="error", - event_call=__event_call__, - ) - - except Exception as e: - await self._log( - f"[🤖 异步摘要任务] ❌ 错误: {str(e)}", - log_type="error", - event_call=__event_call__, - ) - - if __event_emitter__: - await __event_emitter__( - { - "type": "status", - "data": { - "description": f"摘要生成错误: {str(e)[:100]}...", - "done": True, - }, - } - ) - - logger.exception("[🤖 异步摘要任务] ❌ 发生异常") - - def _format_messages_for_summary(self, messages: list) -> str: - """Formats messages for summarization.""" - formatted = [] - for i, msg in enumerate(messages, 1): - role = msg.get("role", "unknown") - content = msg.get("content", "") - - # Handle multimodal content - if isinstance(content, list): - text_parts = [] - for part in content: - if isinstance(part, dict) and part.get("type") == "text": - text_parts.append(part.get("text", "")) - content = " ".join(text_parts) - - # Handle role name - role_name = {"user": "User", "assistant": "Assistant"}.get(role, role) - - # User requested to remove truncation to allow full context for summary - # unless it exceeds model limits (which is handled by the LLM call itself or max_tokens) - - formatted.append(f"[{i}] {role_name}: {content}") - - return "\n\n".join(formatted) - - async def _call_summary_llm( - self, - previous_summary: Optional[str], - new_conversation_text: str, - body: dict, - user_data: dict, - __event_call__: Callable[[Any], Awaitable[None]] = None, - ) -> str: - """ - 调用 LLM 生成摘要,使用 Open Web UI 的内置方法。 - """ - await self._log( - f"[🤖 LLM 调用] 使用 Open Web UI 内置方法", - 
event_call=__event_call__, - ) - - # 构建摘要提示词 (优化版) - summary_prompt = f""" -你是一个专业的对话上下文压缩专家。你的任务是对以下对话内容进行高保真摘要。 -这段对话可能包含之前的摘要(作为系统消息或文本)以及后续的对话内容。 - -### 核心目标 -1. **全面总结**:将对话中的关键信息、用户意图、助手回复进行精炼总结。 -2. **去噪提纯**:移除寒暄、重复、确认性回复等无用信息。 -3. **关键保留**: - * **代码片段、命令、技术参数必须逐字保留,严禁修改或概括。** - * 用户意图、核心需求、决策结论、待办事项必须清晰保留。 -4. **连贯性**:生成的摘要应作为一个整体,能够替代原始对话作为上下文。 -5. **详尽记录**:由于允许的篇幅较长,请尽可能保留对话中的细节、论证过程和多轮交互的细微差别,而不仅仅是宏观概括。 - -### 输出要求 -* **格式**:结构化文本,逻辑清晰。 -* **语言**:与对话语言保持一致(通常为中文)。 -* **长度**:严格控制在 {self.valves.max_summary_tokens} Token 以内。 -* **严禁**:不要输出"根据对话..."、"摘要如下..."等废话。直接输出摘要内容。 - -### 摘要结构建议 -* **当前目标/主题**:一句话概括当前正在解决的问题。 -* **关键信息与上下文**: - * 已确认的事实/参数。 - * **代码/技术细节** (使用代码块包裹)。 -* **进展与结论**:已完成的步骤和达成的共识。 -* **待办/下一步**:明确的后续行动。 - ---- -{new_conversation_text} ---- - -请根据上述内容,生成摘要: -""" - # 确定使用的模型 - model = self._clean_model_id(self.valves.summary_model) or self._clean_model_id( - body.get("model") - ) - - if not model: - await self._log( - "[🤖 LLM 调用] ⚠️ 摘要模型不存在,跳过摘要生成", - log_type="warning", - event_call=__event_call__, - ) - return "" - - await self._log(f"[🤖 LLM 调用] 模型: {model}", event_call=__event_call__) - - # 构建 payload - payload = { - "model": model, - "messages": [{"role": "user", "content": summary_prompt}], - "stream": False, - "max_tokens": self.valves.max_summary_tokens, - "temperature": self.valves.summary_temperature, - } - - try: - # 获取用户对象 - user_id = user_data.get("id") if user_data else None - if not user_id: - raise ValueError("无法获取用户 ID") - - # [优化] 在后台线程中获取用户对象以避免阻塞事件循环 - await self._log( - "[优化] 在后台线程中获取用户对象以避免阻塞事件循环。", - event_call=__event_call__, - ) - user = await asyncio.to_thread(Users.get_user_by_id, user_id) - - if not user: - raise ValueError(f"无法找到用户: {user_id}") - - await self._log( - f"[🤖 LLM 调用] 用户: {user.email}\n[🤖 LLM 调用] 发送请求...", - event_call=__event_call__, - ) - - # 创建 Request 对象 - request = Request(scope={"type": "http", "app": webui_app}) - - # 调用 generate_chat_completion - response = await 
generate_chat_completion(request, payload, user) - - # Handle JSONResponse (some backends return JSONResponse instead of dict) - if hasattr(response, "body"): - # It's a Response object, extract the body - import json as json_module - - try: - response = json_module.loads(response.body.decode("utf-8")) - except Exception: - raise ValueError(f"Failed to parse JSONResponse body: {response}") - - if ( - not response - or not isinstance(response, dict) - or "choices" not in response - or not response["choices"] - ): - raise ValueError( - f"LLM response format incorrect or empty: {type(response).__name__}" - ) - - summary = response["choices"][0]["message"]["content"].strip() - - await self._log( - f"[🤖 LLM 调用] ✅ 成功接收摘要", - log_type="success", - event_call=__event_call__, - ) - - return summary - - except Exception as e: - error_msg = str(e) - # Handle specific error messages - if "Model not found" in error_msg: - error_message = f"摘要模型 '{model}' 不存在。" - else: - error_message = f"摘要 LLM 错误 ({model}): {error_msg}" - if not self.valves.summary_model: - error_message += ( - "\n[提示] 您未指定 summary_model,因此过滤器尝试使用当前对话的模型。" - "如果这是流水线 (Pipe) 模型或不兼容的模型,请在配置中指定兼容的摘要模型 (例如 'gemini-2.5-flash')。" - ) - - await self._log( - f"[🤖 LLM 调用] ❌ {error_message}", - log_type="error", - event_call=__event_call__, - ) - - raise Exception(error_message)