feat(iod): 重构数联网搜索功能

- 新增数联网设置页面
- 优化数联网搜索结果展示
- 添加数据集、科创场景和科技企业等不同类型的搜索结果
- 重构搜索结果卡片组件,支持加载状态和不同展示模式
- 更新数联网搜索相关的国际化文案
This commit is contained in:
zhaoweijie
2025-08-22 17:15:19 +08:00
parent efbf2a3eff
commit 17020e8755
33 changed files with 1321 additions and 773 deletions

View File

@@ -1,296 +1,401 @@
import { cleanUrl } from "@/libs/clean-url"
import { PageAssistHtmlLoader } from "@/loader/html"
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
import { pageAssistEmbeddingModel } from "@/models/embedding"
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
import { getOllamaURL } from "@/services/ollama"
import {
getIsSimpleInternetSearch,
totalSearchResults
} from "@/services/search"
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
import { Document } from "@langchain/core/documents"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import type { IodRegistryEntry } from "~/types/iod"
import { Document } from "@langchain/core/documents"
import { AllIodRegistryEntry, IodRegistryEntry } from "~/types/iod"
import { PageAssitDatabase } from "@/db"
import exp from "constants"
import { Segment, useDefault, cnPOSTag, enPOSTag} from 'segmentit';
const segment = useDefault(new Segment());
import { enPOSTag, Segment, useDefault } from "segmentit"
import { getDefaultIodSources } from "@/libs/iod.ts"
const segment = useDefault(new Segment())
/**
 * Splits free text into keywords using the module-level `segment` tokenizer.
 *
 * Every token is logged together with the English name of its part-of-speech
 * tag; tokens whose text is a single character (or empty) are dropped from
 * the returned list.
 *
 * @param input Text to tokenize.
 * @returns The text of every token longer than one character, in order.
 */
export const tokenizeInput = function (input: string): string[] {
  const tokens = segment.doSegment(input, { simple: false })
  console.log(tokens.map((token) => ({ w: token.w, p: enPOSTag(token.p) })))
  return tokens.filter((token) => token.w.length > 1).map((token) => token.w)
}
//doipUrl = tcp://reg01.public.internetofdata.cn:21037
/** Built-in connection settings, used whenever no usable override is stored. */
export const _iodConfig = {
  gatewayUrl: "tcp://reg01.public.internetofdata.cn:21037",
  registry: "data/Registry",
  localRepository: "data/Repository",
  doBrowser: "http://021.node.internetapi.cn:21030/SCIDE/SCManager"
}

/**
 * Returns the effective connection settings.
 *
 * Reads the JSON stored under the `"iod-connect"` localStorage key and layers
 * it over `_iodConfig`. The defaults are returned unchanged when storage is
 * unavailable, the key is missing, the value is not valid JSON, or the JSON is
 * not a plain object. For each of the four known settings, a stored value is
 * only accepted when it is a non-empty string; otherwise the default is kept.
 * Unknown stored keys are passed through untouched.
 */
function getIodConfig(): typeof _iodConfig {
  let raw: string | null
  try {
    // `localStorage` is not defined in every runtime; the bare reference can throw.
    raw = localStorage.getItem("iod-connect")
  } catch {
    return _iodConfig
  }
  if (!raw) {
    return _iodConfig
  }

  let parsed: unknown
  try {
    parsed = JSON.parse(raw)
  } catch {
    return _iodConfig
  }
  // JSON such as `null`, `42` or `[]` parses fine but is not a settings object.
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    return _iodConfig
  }

  const stored = parsed as Record<string, unknown>
  const config: Record<string, unknown> = { ...stored }
  for (const [key, fallback] of Object.entries(_iodConfig)) {
    const value = stored[key]
    config[key] = typeof value === "string" && value !== "" ? value : fallback
  }
  return config as typeof _iodConfig
}
/**
 * Alternative connection settings whose gateway and browser endpoints point at
 * the loopback address (127.0.0.1). Nothing in the visible part of this file
 * reads it.
 */
export const iodConfigLocal = {
  gatewayUrl: "tcp://127.0.0.1:21036",
  registry: "bdware/Registry",
  localRepository: "bdtest.local/myrepo1",
  doBrowser: "http://127.0.0.1:21030/SCIDE/SCManager"
}
/** Words that must not be sent as individual search terms. */
const GREP_LIST_WORDS =
  "什么|问题|需要|合适|设计|考虑|合作|精度|传感器|最新|研究|药物".split("|")

/**
 * Tells whether `str` is covered by the exclusion word list.
 *
 * A string matches when it equals one of the listed words or is a fragment of
 * a single listed word (the fragment behaviour is kept from the previous
 * substring check). The empty string, the `|` separator, and text that spans
 * two neighbouring words never match.
 *
 * @param str Candidate keyword.
 * @returns `true` when the keyword should be excluded.
 */
function inGrepList(str: string): boolean {
  if (str === "") {
    return false
  }
  return GREP_LIST_WORDS.some((word) => word.includes(str))
}
/**
 * Builds the `executeContract` payload for a registry "Search" restricted to
 * one `data_type`.
 *
 * The search clauses always start with a MUST clause on `data_type`. A string
 * keyword adds one MUST clause on `description`; an array adds one SHOULD
 * clause on `description` per element that `inGrepList` does not exclude.
 * Registry id and gateway URL come from `getIodConfig()` at call time.
 *
 * @param count Value sent as `attributes.count`.
 * @param keyword A single phrase, or a list of keywords.
 * @param dataType Value the `data_type` field must have.
 * @returns The request payload object (not yet serialized).
 */
export const makeSearchParamsWithDataType = function (
  count: number,
  keyword: string | string[],
  dataType: string
) {
  const iodConfig = getIodConfig()
  const searchMode: { key: string; type: "MUST" | "SHOULD"; value: string }[] =
    [{ key: "data_type", type: "MUST", value: dataType }]

  if (typeof keyword === "string") {
    // A single phrase must match the description.
    searchMode.push({ key: "description", type: "MUST", value: keyword })
  } else if (Array.isArray(keyword)) {
    // Each keyword that is not on the exclusion list becomes an optional clause.
    for (const term of keyword) {
      if (!inGrepList(term)) {
        searchMode.push({ key: "description", type: "SHOULD", value: term })
      }
    }
  }

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg: {
      id: iodConfig.registry,
      doipUrl: iodConfig.gatewayUrl,
      op: "Search",
      vars: {
        timeout: 15000
      },
      attributes: {
        offset: 0,
        count,
        bodyBase64Encoded: false,
        searchMode
      },
      body: ""
    }
  }
}
/**
 * Builds the `executeContract` payload for a registry "Search" without any
 * `data_type` restriction.
 *
 * A string keyword produces one MUST clause on `description`; an array
 * produces one SHOULD clause on `description` per element that `inGrepList`
 * does not exclude. Registry id and gateway URL come from `getIodConfig()` at
 * call time.
 *
 * @param count Value sent as `attributes.count`.
 * @param keyword A single phrase, or a list of keywords.
 * @returns The request payload object (not yet serialized).
 */
export const makeRegSearchParams = function (
  count: number,
  keyword: string | string[]
) {
  const iodConfig = getIodConfig()
  const searchMode: { key: string; type: "MUST" | "SHOULD"; value: string }[] =
    []

  if (typeof keyword === "string") {
    // A single phrase must match the description.
    searchMode.push({ key: "description", type: "MUST", value: keyword })
  } else if (Array.isArray(keyword)) {
    // Each keyword that is not on the exclusion list becomes an optional clause.
    for (const term of keyword) {
      if (!inGrepList(term)) {
        searchMode.push({ key: "description", type: "SHOULD", value: term })
      }
    }
  }

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg: {
      id: iodConfig.registry,
      doipUrl: iodConfig.gatewayUrl,
      op: "Search",
      vars: {
        timeout: 15000
      },
      attributes: {
        offset: 0,
        count,
        bodyBase64Encoded: false,
        searchMode
      },
      body: ""
    }
  }
}
/**
 * Builds the `executeContract` payload that forwards one operation to a
 * digital object through the gateway returned by `getIodConfig()`.
 *
 * @param doId Identifier placed in `arg.id`.
 * @param op Operation name placed in `arg.op` (callers in this file use
 *   "Retrieve" and "Update").
 * @param attributes Object placed in `arg.attributes` as-is.
 * @param requestBody String placed in `arg.body`.
 * @returns The request payload object (not yet serialized).
 */
export const makeDOIPParams = (
  doId: string,
  op: string,
  attributes: Object,
  requestBody: string
) => {
  // Read the settings per call so a changed stored configuration is picked up.
  const { gatewayUrl } = getIodConfig()
  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg: {
      id: doId,
      doipUrl: gatewayUrl,
      op,
      attributes,
      body: requestBody
    }
  }
}
/**
 * Fetches one digital object with a "Retrieve" request and wraps the answer
 * as a document.
 *
 * The request is POSTed to the configured `doBrowser` endpoint and aborted
 * after 10 seconds. The abort timer is cleared once the request settles, so it
 * does not stay pending after a fast response.
 *
 * @param doId Identifier of the object to retrieve.
 * @returns A document whose `pageContent` is `result.body` of the response and
 *   whose `metadata.traceId` is `result.header.attributes.traceId` (if any).
 * @throws If the request fails or is aborted, the response is not JSON, or the
 *   response has no `result.header`.
 */
export const retrieveDoc = async function (doId: string): Promise<Document> {
  const iodConfig = getIodConfig()
  console.log("retriveDoc:" + doId)
  const params = makeDOIPParams(
    doId,
    "Retrieve",
    {
      bodyBase64Encoded: false
    },
    ""
  )
  const abortController = new AbortController()
  const abortTimer = setTimeout(() => abortController.abort(), 10000)
  try {
    const response = await fetch(iodConfig.doBrowser, {
      method: "POST",
      body: JSON.stringify(params),
      signal: abortController.signal
    })
    console.log("responseIn retrieveDoc:")
    console.log(response)
    const res = await response.json()
    console.log("res:")
    console.log(res.result.body)
    // TODO (carried over from the original code; what remains to be done is not stated)
    return {
      metadata: { traceId: res.result.header.attributes?.traceId },
      pageContent: res.result.body
    }
  } finally {
    clearTimeout(abortTimer)
  }
}
/**
 * Sends an "Update" request for the configured local repository, carrying the
 * serialized `requestBody` and tagging it with `aiDialogID = historyId`.
 *
 * The request is POSTed to the configured `doBrowser` endpoint and aborted
 * after 10 seconds. The abort timer is cleared once the request settles.
 *
 * @param historyId Dialog identifier sent as the `aiDialogID` attribute.
 * @param requestBody Value serialized with `JSON.stringify` as the request body.
 * @returns The `body` property of the parsed JSON response.
 * @throws If the request fails or is aborted, or the response is not JSON.
 */
export const updateInLocalRepo = async function (
  historyId: string,
  requestBody: Object
): Promise<string> {
  const iodConfig = getIodConfig()
  const params = makeDOIPParams(
    iodConfig.localRepository,
    "Update",
    {
      aiDialogID: historyId,
      bodyBase64Encoded: false
    },
    JSON.stringify(requestBody)
  )
  const abortController = new AbortController()
  const abortTimer = setTimeout(() => abortController.abort(), 10000)
  try {
    const response = await fetch(iodConfig.doBrowser, {
      method: "POST",
      body: JSON.stringify(params),
      signal: abortController.signal
    })
    const res = await response.json()
    console.log("update dialog:" + JSON.stringify(res))
    // NOTE(review): retrieveDoc reads `res.result.body` from the same endpoint,
    // while this reads `res.body` — confirm which shape "Update" responses have.
    return res.body
  } finally {
    clearTimeout(abortTimer)
  }
}
/**
 * Records one question/answer exchange of a chat history in the local
 * repository via `updateInLocalRepo`.
 *
 * The question is the first message in the stored history whose role is
 * "user"; when no such message exists the last stored message is used, as
 * before. The answer is `botMessage`. Web and IoD sources attached to
 * `botMessage` are included with a size figure for each.
 *
 * @param histroyId Identifier of the chat history.
 * @param botMessage The assistant message; `id`, `content`, `webSources` and
 *   `iodSources` are read from it.
 * @returns Whatever `updateInLocalRepo` resolves to.
 * @throws Error when the stored history contains no message at all.
 */
export const updateDialog = async function (
  histroyId: string,
  botMessage: any
): Promise<string> {
  //TODO @Nex confused by Message/MessageType in ./db/index.ts!
  const db = new PageAssitDatabase()
  const chatHistory = await db.getChatHistory(histroyId)
  const userMessage =
    chatHistory.find((message) => message.role === "user") ??
    chatHistory[chatHistory.length - 1]
  if (!userMessage) {
    throw new Error(
      "updateDialog: chat history " + histroyId + " contains no messages"
    )
  }

  // NOTE(review): searchIod flattens per-category results with
  // Object.values(...).map((item) => item.data).flat(); the same shape is
  // accepted here in case `iodSources` is stored that way — confirm with the
  // code that sets `botMessage.iodSources`.
  const rawIodSources = botMessage.iodSources
  const iodSources: any[] = Array.isArray(rawIodSources)
    ? rawIodSources
    : rawIodSources && typeof rawIodSources === "object"
      ? Object.values(rawIodSources).flatMap(
          (group: any) => group?.data ?? []
        )
      : []

  // !!!IMPORTANT!!! traceId = histroyId+"/"+userMessage.id;
  // Update traceId in retrieveDoc!
  const traceId = histroyId + "/" + userMessage.id
  const updateBody = {
    traceId,
    question: {
      id: traceId,
      content: userMessage.content,
      tokenCount: userMessage.content.length
    },
    answer: {
      id: histroyId + "/" + botMessage.id,
      content: botMessage.content,
      tokenCount: botMessage.content.length
    },
    //TODO set a correct model ID
    model: { id: "bdware.ollama/" + userMessage.name },
    //TODO incorrect tokenCount calculated!!
    webSources:
      botMessage.webSources?.map((r) => ({
        url: r.url,
        tokenCount: r.url.length,
        content: r.url,
        traceId: r?.traceId
      })) ?? [],
    IoDSources: iodSources.map((r) => {
      const text = r.content || r.description
      return {
        id: r.doId,
        tokenCount: text ? calculateTokenCount(text) : 0,
        content: text,
        traceId: r?.traceId
      }
    })
  }
  console.log("updateBody:")
  console.log(updateBody)
  return updateInLocalRepo(histroyId, updateBody)
}
/**
 * Runs three registry searches in parallel — one each for the data types
 * "data", "scenario" and "organization" — and returns the results grouped by
 * category.
 *
 * Each response is stored under the category it was requested for, even when
 * an earlier response is skipped. A response is skipped (its category keeps
 * the default from `getDefaultIodSources()`) when its `status` is not
 * "Success", its `result.body` is not valid JSON, or the body's `code` is not
 * 0. Within a category, entries get `url`/`doId` filled from `pdf_url`/`doid`
 * and entries repeating an already seen `doId` are dropped; entries without
 * any `doId` are kept.
 *
 * All three requests share one 10 second abort timer, which is cleared when
 * the search finishes. Any thrown error is logged and yields the defaults.
 *
 * @param query Not used by this function.
 * @param keywords Keywords turned into search clauses.
 * @returns Results per category, each with its entries and reported total.
 */
export async function localIodSearch(
  query: string,
  keywords: string[]
): Promise<AllIodRegistryEntry> {
  const iodConfig = getIodConfig()
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()
  const params = makeRegSearchParams(TOTAL_SEARCH_RESULTS, keywords)
  // Request order and result bucket are tied together through this list.
  const categories = ["data", "scenario", "organization"] as const
  const categoryParams = categories.map((category) =>
    makeSearchParamsWithDataType(TOTAL_SEARCH_RESULTS, keywords, category)
  )

  const abortController = new AbortController()
  const abortTimer = setTimeout(() => abortController.abort(), 10000)
  try {
    console.log("params------->", params)
    //TODO @Zhaoweijie these three requests search data, scenarios and teams respectively.
    const responses = await Promise.all(
      categoryParams.map((body) =>
        fetch(iodConfig.doBrowser, {
          method: "POST",
          body: JSON.stringify(body),
          signal: abortController.signal
        })
      )
    )
    const results = await Promise.all(responses.map((res) => res.json()))
    const allResults: AllIodRegistryEntry = getDefaultIodSources()

    for (const [index, category] of categories.entries()) {
      const res = results[index]
      // Check the top-level status; skip failed requests.
      if (res.status !== "Success") {
        continue
      }

      let body
      try {
        body = JSON.parse(res.result.body)
      } catch (e) {
        console.warn("Failed to parse result.body as JSON", e)
        continue
      }
      if (body.code !== 0) {
        continue
      }

      const entries: IodRegistryEntry[] = body.data?.results || []
      const prunedEntries: IodRegistryEntry[] = []
      const seenDoIds = new Set<string>()
      // Clean-up: fill in url and doId, then drop repeated doIds.
      for (const r of entries) {
        r.url = r.url || r.pdf_url
        // @ts-ignore -- kept from the original: `doid` is the raw response spelling
        r.doId = r.doId || r.doid
        if (r.doId) {
          if (seenDoIds.has(r.doId)) {
            continue
          }
          seenDoIds.add(r.doId)
        }
        prunedEntries.push(r)
      }

      allResults[category] = {
        data: prunedEntries,
        total: body.data?.total ?? 0
      }
    }
    return allResults
  } catch (e) {
    console.log(e)
    return getDefaultIodSources()
  } finally {
    clearTimeout(abortTimer)
  }
}
const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\//
@@ -300,28 +405,41 @@ export const searchIod = async (query: string, keywords: string[]) => {
const searchResults = await localIodSearch(query, keywords)
const isSimpleMode = await getIsSimpleInternetSearch()
console.log("searchMode:"+isSimpleMode+"\n kw:"+JSON.stringify(keywords)+"\n"+" ->searchResult:\n"+JSON.stringify(searchResults))
console.log("pruned Search Result:"+JSON.stringify(searchResults.map(r=>r.doId+" "+r.name)))
console.log(
"searchMode:" +
isSimpleMode +
"\n kw:" +
JSON.stringify(keywords) +
"\n" +
" ->searchResult:\n" +
JSON.stringify(searchResults)
)
if (isSimpleMode) {
await getOllamaURL()
return searchResults
}
const docs: Document<Record<string, any>>[] = []
const resMap = new Map<string, IodRegistryEntry>()
for (const result of searchResults) {
for (const result of Object.values(searchResults)
.map((item) => item.data)
.flat()) {
const url = result.url
if (result.doId){
//TODO !!!!@Nex traceId should be the id of history/question!
let docFromRetrieve = await retrieveDoc(result.doId);
console.log("doc from Retrieve:"+result.doId+" -->"+JSON.stringify(docFromRetrieve))
docs.push(docFromRetrieve)
result.description = docFromRetrieve.pageContent;
result.traceId = docFromRetrieve.metadata?.traceId;
continue;
if (result.doId) {
//TODO !!!!@Nex traceId should be the id of history/question!
let docFromRetrieve = await retrieveDoc(result.doId)
console.log(
"doc from Retrieve:" +
result.doId +
" -->" +
JSON.stringify(docFromRetrieve)
)
docs.push(docFromRetrieve)
result.description = docFromRetrieve.pageContent
result.traceId = docFromRetrieve.metadata?.traceId
continue
}
if (!url) {
continue;
continue
}
let htmlUrl = ""
@@ -419,8 +537,7 @@ export const searchIod = async (query: string, keywords: string[]) => {
*/
}
/**
 * Returns the size of `str` in bytes when encoded as UTF-8.
 *
 * Despite the name, this is a byte count rather than a model token count:
 * ASCII characters count as 1 and, for example, CJK characters count as 3.
 *
 * @param str Text to measure.
 * @returns Number of UTF-8 bytes.
 */
export const calculateTokenCount = function (str: string): number {
  return new TextEncoder().encode(str).length
}

File diff suppressed because one or more lines are too long