2025-02-14 18:17:12 +08:00
|
|
|
|
import { cleanUrl } from "@/libs/clean-url"
|
|
|
|
|
|
import { PageAssistHtmlLoader } from "@/loader/html"
|
2025-02-14 23:24:27 +08:00
|
|
|
|
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
|
2025-02-14 18:17:12 +08:00
|
|
|
|
import { pageAssistEmbeddingModel } from "@/models/embedding"
|
|
|
|
|
|
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
|
2025-08-17 22:39:12 +08:00
|
|
|
|
|
2025-02-14 18:17:12 +08:00
|
|
|
|
import {
|
|
|
|
|
|
getIsSimpleInternetSearch,
|
|
|
|
|
|
totalSearchResults
|
|
|
|
|
|
} from "@/services/search"
|
|
|
|
|
|
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
|
2025-03-24 13:21:49 +08:00
|
|
|
|
import { Document } from "@langchain/core/documents"
|
2025-02-14 18:17:12 +08:00
|
|
|
|
import { MemoryVectorStore } from "langchain/vectorstores/memory"
|
2025-02-14 23:24:27 +08:00
|
|
|
|
import type { IodRegistryEntry } from "~/types/iod"
|
2025-02-14 18:17:12 +08:00
|
|
|
|
|
2025-03-24 13:21:49 +08:00
|
|
|
|
|
|
|
|
|
|
import { PageAssitDatabase } from "@/db"
|
2025-08-17 22:39:12 +08:00
|
|
|
|
import exp from "constants"
|
|
|
|
|
|
|
|
|
|
|
|
import { Segment, useDefault, cnPOSTag, enPOSTag} from 'segmentit';
|
|
|
|
|
|
// Module-wide segmentit instance shared by tokenizeInput; built once at import time.
// NOTE(review): `useDefault` presumably loads segmentit's bundled dictionaries — confirm against the library docs.
const segment = useDefault(new Segment());
|
|
|
|
|
|
/**
 * Splits `input` into tokens with the shared `segment` instance and returns
 * the text of every token longer than one character, in segmentation order.
 *
 * As a debugging aid, the full token list (text plus the tag produced by
 * `enPOSTag`) is written to the console before filtering.
 *
 * @param input text to tokenize
 * @returns the texts of all tokens whose length is greater than 1
 */
export const tokenizeInput = function (input: string): string[] {
  const tokens = segment.doSegment(input, { simple: false });

  const tagged = tokens.map((token) => ({ w: token.w, p: enPOSTag(token.p) }));
  console.log(tagged);

  const kept: string[] = [];
  for (const token of tokens) {
    if (token.w.length > 1) {
      kept.push(token.w);
    }
  }
  return kept;
}
|
2025-03-24 13:21:49 +08:00
|
|
|
|
/**
 * Endpoints and identifiers used by the request builders in this file.
 *
 * - `gatewayUrl`: sent as `doipUrl` in every request
 *   (tcp://reg01.public.internetofdata.cn:21037).
 * - `registry`: object id targeted by the Search requests.
 * - `localRepository`: object id targeted by `updateInLocalRepo`.
 * - `doBrowser`: HTTP endpoint that all requests are POSTed to.
 */
export const iodConfig = {
  gatewayUrl: "tcp://reg01.public.internetofdata.cn:21037",
  registry: "data/Registry",
  localRepository: "data/Repository",
  doBrowser: "http://021.node.internetapi.cn:21030/SCIDE/SCManager",
}
|
2025-08-20 18:36:48 +08:00
|
|
|
|
/**
 * Same shape as `iodConfig`, but with every endpoint on 127.0.0.1 and
 * different registry / repository ids.
 * It is not referenced anywhere else in the visible part of this file.
 */
export const iodConfigLocal = {
  gatewayUrl: "tcp://127.0.0.1:21036",
  registry: "bdware/Registry",
  localRepository: "bdtest.local/myrepo1",
  doBrowser: "http://127.0.0.1:21030/SCIDE/SCManager",
}
|
2025-08-17 22:39:12 +08:00
|
|
|
|
/** Keywords that are dropped before building a search request. */
const GREP_LIST_WORDS: ReadonlySet<string> = new Set([
  "什么",
  "问题",
  "需要",
  "合适",
  "设计",
  "考虑",
  "合作",
  "精度",
  "传感器",
  "最新",
  "研究",
  "药物",
]);

/**
 * Tells whether `str` must be left out of a keyword search.
 *
 * The check is an exact match against the word list. The previous
 * implementation searched for `str` as a substring of the words joined by
 * "|", so fragments of a word ("传感"), text spanning two words ("题|需")
 * and the separator itself were all rejected by accident.
 *
 * The empty string is still reported as filtered (as it was before), since
 * an empty keyword carries nothing to search for.
 *
 * @param str candidate keyword
 * @returns true when the keyword should be skipped
 */
function inGrepList(str: string) {
  return str.length === 0 || GREP_LIST_WORDS.has(str);
}
|
2025-08-20 18:36:48 +08:00
|
|
|
|
|
|
|
|
|
|
/**
 * Builds the `executeContract` payload for a registry Search limited to one
 * data type.
 *
 * The first search clause always requires `data_type` to equal `dataType`.
 * A string `keyword` adds a single MUST clause on `description`; an array
 * adds one SHOULD clause on `description` for every element that
 * `inGrepList` does not reject.
 *
 * @param count    value placed in `attributes.count`
 * @param keyword  a single phrase, or a list of keywords
 * @param dataType value required for the `data_type` field
 * @returns the request object; target id and gateway URL come from `iodConfig`
 */
export const makeSearchParamsWithDataType = function(count: number, keyword: string| string[], dataType: string){
  const descriptionClauses =
    typeof keyword === 'string'
      ? [{ key: "description", type: "MUST", value: keyword }]
      : Array.isArray(keyword)
        ? keyword
            .filter((word) => !inGrepList(word))
            .map((word) => ({ key: "description", type: "SHOULD", value: word }))
        : [];

  const searchMode = [
    { key: "data_type", type: "MUST", value: dataType },
    ...descriptionClauses,
  ];

  const attributes = {
    offset: 0,
    count,
    bodyBase64Encoded: false,
    searchMode,
  };

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg: {
      id: iodConfig.registry,
      // A commented-out alternative here used tcp://127.0.0.1:21039.
      doipUrl: iodConfig.gatewayUrl,
      op: "Search",
      vars: { timeout: 15000 },
      attributes,
      body: "",
    },
  };
}
|
|
|
|
|
|
|
2025-08-17 22:39:12 +08:00
|
|
|
|
/**
 * Builds the `executeContract` payload for a registry Search over the
 * `description` field, without any data-type restriction.
 *
 * A list of keywords produces one SHOULD clause per keyword that
 * `inGrepList` does not reject; a single string produces one MUST clause.
 *
 * @param count   value placed in `attributes.count`
 * @param keyword a single phrase, or a list of keywords
 * @returns the request object; target id and gateway URL come from `iodConfig`
 */
export const makeRegSearchParams = function(count: number, keyword: string| string[]){
  const searchMode: { key: string; type: string; value: string }[] = [];

  if (Array.isArray(keyword)) {
    // One optional clause per keyword that survives the filter.
    for (const word of keyword) {
      if (inGrepList(word)) {
        continue;
      }
      searchMode.push({ key: "description", type: "SHOULD", value: word });
    }
  } else if (typeof keyword === 'string') {
    // A single phrase becomes one mandatory clause.
    searchMode.push({ key: "description", type: "MUST", value: keyword });
  }

  const arg = {
    id: iodConfig.registry,
    // A commented-out alternative here used tcp://127.0.0.1:21039.
    doipUrl: iodConfig.gatewayUrl,
    op: "Search",
    vars: { timeout: 15000 },
    attributes: {
      offset: 0,
      count,
      bodyBase64Encoded: false,
      searchMode,
    },
    body: "",
  };

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg,
  };
}
|
2025-03-24 13:21:49 +08:00
|
|
|
|
|
2025-08-17 22:39:12 +08:00
|
|
|
|
/**
 * Wraps a single DOIP operation in the `executeContract` envelope used by
 * the other helpers in this file.
 *
 * @param doId        id of the target digital object
 * @param op          operation name (this file uses "Retrieve" and "Update")
 * @param attributes  attributes forwarded unchanged
 * @param requestBody body forwarded unchanged
 * @returns the request object; the gateway URL comes from `iodConfig`
 */
export const makeDOIPParams = (doId:string, op:string, attributes:Object, requestBody: string) => {
  const arg = {
    id: doId,
    doipUrl: iodConfig.gatewayUrl,
    op,
    attributes,
    body: requestBody,
  };

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg,
  };
}
|
2025-08-17 22:39:12 +08:00
|
|
|
|
|
|
|
|
|
|
/**
 * Sends a "Retrieve" operation for `doId` to `iodConfig.doBrowser` and
 * returns the reply as a document.
 *
 * The request is aborted after 10 seconds. The abort timer is cleared as
 * soon as the call settles, so it no longer stays pending after the
 * request has finished.
 *
 * @param doId id of the digital object to retrieve
 * @returns an object whose `pageContent` is `result.body` of the reply and
 *          whose `metadata.traceId` is `result.header.attributes.traceId`
 *          (undefined when the reply carries no header or attributes)
 * @throws rejects when the request fails or is aborted, when the reply is
 *         not JSON, or when the reply has no `result`
 */
export const retrieveDoc = async function(doId: string) : Promise<Document> {
  console.log("retriveDoc:"+doId)
  const params = makeDOIPParams(doId, "Retrieve", { bodyBase64Encoded: false }, "");

  const abortController = new AbortController()
  const abortTimer = setTimeout(() => abortController.abort(), 10000)
  try {
    const response = await fetch(iodConfig.doBrowser, {
      method: "POST",
      body: JSON.stringify(params),
      signal: abortController.signal
    })
    console.log("responseIn retrieveDoc:");
    console.log(response);

    const res = await response.json()
    console.log("res:");
    console.log(res.result.body);
    //TODO
    return {
      // `header` is optional-chained so a reply without one yields an
      // undefined traceId instead of a TypeError.
      metadata: { traceId: res.result.header?.attributes?.traceId },
      pageContent: res.result.body
    }
  } finally {
    clearTimeout(abortTimer)
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Sends an "Update" operation to `iodConfig.localRepository`, carrying
 * `requestBody` serialized as JSON and `historyId` as the `aiDialogID`
 * attribute.
 *
 * The request is aborted after 10 seconds. The abort timer is cleared as
 * soon as the call settles, so it no longer stays pending after the
 * request has finished.
 *
 * @param historyId   chat history id, sent as `aiDialogID`
 * @param requestBody payload to store
 * @returns the `body` field of the parsed reply
 *          NOTE(review): `retrieveDoc` reads `result.body` from its reply
 *          while this reads `body` — confirm which one the Update reply uses.
 * @throws rejects when the request fails or is aborted, or the reply is not JSON
 */
export const updateInLocalRepo = async function(historyId: string, requestBody: Object) : Promise<string> {
  const params = makeDOIPParams(iodConfig.localRepository, "Update", {
    "aiDialogID": historyId,
    bodyBase64Encoded: false
  }, JSON.stringify(requestBody));

  const abortController = new AbortController()
  const abortTimer = setTimeout(() => abortController.abort(), 10000)
  try {
    const response = await fetch(iodConfig.doBrowser, {
      method: "POST",
      body: JSON.stringify(params),
      signal: abortController.signal
    })
    const res = await response.json()
    console.log("update dialog:"+JSON.stringify(res))
    return res.body;
  } finally {
    clearTimeout(abortTimer)
  }
}
|
|
|
|
|
|
/**
 * Builds a question/answer record for one chat and stores it through
 * `updateInLocalRepo`.
 *
 * The question is the first message with role `user` in the stored history
 * of `histroyId`; the answer is `botMessage`. Sources attached to
 * `botMessage` (`webSources`, `iodSources`) are copied into the record.
 *
 * Previously, a history without any `user` message left the lookup pointing
 * at the last non-user message (or at null for an empty history), so the
 * record was built from the wrong message or failed with an opaque
 * TypeError. That case now fails with an explicit error.
 *
 * @param histroyId  id of the chat history
 * @param botMessage the assistant message; `id`, `content`, `webSources`
 *                   and `iodSources` are read from it
 * @returns whatever `updateInLocalRepo` resolves to
 * @throws Error when the history contains no message with role `user`
 */
export const updateDialog = async function(histroyId : string, botMessage: any): Promise<string> {
  //TODO @Nex confused by Message/MessageType in ./db/index.ts!
  const db = new PageAssitDatabase()
  const chatHistory = await db.getChatHistory(histroyId)

  let userMessage = null;
  for (const message of chatHistory) {
    if (message.role === 'user') {
      userMessage = message;
      break;
    }
  }
  if (!userMessage) {
    throw new Error("updateDialog: no user message found in chat history " + histroyId)
  }

  // !!!IMPORTANT!!! traceId = histroyId+"/"+userMessage.id;
  // Update traceId in retrieveDoc!
  const questionId = histroyId+"/"+userMessage.id;

  const updateBody: any = {
    traceId: questionId,
    question: {
      "id": questionId,
      "content": userMessage.content,
      "tokenCount": userMessage.content.length
    },
    answer: {
      "id": histroyId+"/"+botMessage.id,
      "content": botMessage.content,
      "tokenCount": botMessage.content.length
    },
    //TODO set a correct model ID
    model: {"id":"bdware.ollama/" + userMessage.name}
  };

  //TODO incorrect tokenCount calculated!!
  updateBody.webSources = botMessage.webSources?.map((r) => ({
    url: r.url,
    tokenCount: r.url.length,
    content: r.url,
    traceId: r.traceId
  })) ?? [];

  updateBody.IoDSources = botMessage.iodSources?.map((r) => {
    // Prefer the retrieved content; fall back to the registry description.
    const text = r.content || r.description;
    return {
      id: r.doId,
      tokenCount: text ? calculateTokenCount(text) : 0,
      content: text,
      traceId: r.traceId
    };
  }) ?? [];

  console.log("updateBody:");
  console.log(updateBody)
  return updateInLocalRepo(histroyId,updateBody)
}
|
|
|
|
|
|
|
2025-02-14 23:24:27 +08:00
|
|
|
|
/**
 * Searches the IoD registry for `keywords` in three categories
 * (`data_type` = "data", "scenario", "organization") and returns the merged,
 * de-duplicated hits.
 *
 * The three requests run in parallel and share one 10-second abort timer,
 * which is cleared once the search finishes. Each request is handled
 * independently: a request that fails, is aborted, or returns something
 * unusable is logged and skipped, and the other categories are still
 * returned. Previously one failing request discarded all three.
 *
 * For every hit, `url` falls back to `pdf_url` and `doId` falls back to
 * `doid`. Hits without a `doId` are dropped; for duplicate `doId`s the first
 * occurrence wins (category order: data, scenario, organization).
 *
 * @param query    not used by this function
 * @param keywords keywords passed to the search request builders
 * @returns the merged hits, or an empty array when nothing usable came back
 */
export async function localIodSearch(
  query: string,
  keywords: string[]
): Promise<IodRegistryEntry[]> {
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()
  const abortController = new AbortController();
  const abortTimer = setTimeout(() => abortController.abort(), 10000);

  // Only logged below for debugging; this request is never sent.
  const params = makeRegSearchParams(TOTAL_SEARCH_RESULTS, keywords);

  //TODO @Zhaoweijie, these three categories are the search requests for data, scenarios and teams.
  const dataTypes = ["data", "scenario", "organization"];

  // Runs one category search; resolves to the parsed reply, or null on any failure.
  const searchCategory = async (dataType: string) => {
    try {
      const response = await fetch(iodConfig.doBrowser, {
        method: "POST",
        body: JSON.stringify(makeSearchParamsWithDataType(TOTAL_SEARCH_RESULTS, keywords, dataType)),
        signal: abortController.signal
      });
      return await response.json();
    } catch (e) {
      console.warn("IoD search request failed for data_type " + dataType, e);
      return null;
    }
  };

  try {
    console.log('params------->',params)
    const results = await Promise.all(dataTypes.map(searchCategory));

    const allResults: IodRegistryEntry[] = [];
    for (const res of results) {
      // Skip requests that failed outright or were not reported as successful.
      if (!res || res.status !== "Success") {
        continue;
      }

      let body;
      try {
        body = JSON.parse(res.result.body);
      } catch (e) {
        console.warn("Failed to parse result.body as JSON", e);
        continue;
      }

      if (!body || body.code !== 0) {
        continue;
      }

      const entries: IodRegistryEntry[] = body.data?.results || [];
      // Normalize: fill in url and doId from their alternative field names.
      for (const r of entries) {
        r.url = r.url || r.pdf_url;
        r.doId = r.doId || r.doid;
      }
      allResults.push(...entries);
    }

    // Keep the first hit for each doId; hits without a doId are dropped.
    const seenDoIds = new Set<string>();
    const prunedResults: IodRegistryEntry[] = [];
    for (const r of allResults) {
      if (r.doId && !seenDoIds.has(r.doId)) {
        seenDoIds.add(r.doId);
        prunedResults.push(r);
      }
    }
    return prunedResults;
  } catch (e) {
    console.log(e);
    return [];
  } finally {
    clearTimeout(abortTimer);
  }
}
|
|
|
|
|
|
|
2025-02-14 23:24:27 +08:00
|
|
|
|
// Matches URLs that begin with http://arxiv.org/ or https://arxiv.org/.
const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\//
|
|
|
|
|
|
// Marker text looked for in a loaded arXiv HTML page; when present, searchIod treats the HTML version as unavailable.
const ARXIV_NO_HTM = "No HTML for"
|
2025-02-14 18:17:12 +08:00
|
|
|
|
|
|
|
|
|
|
/**
 * Runs the IoD registry search and, unless simple search mode is enabled,
 * loads the content behind every hit.
 *
 * - Simple mode: the registry hits are returned as they are.
 * - Otherwise, for each hit:
 *   - with a `doId`: the object is fetched through `retrieveDoc`, and the
 *     hit's `description` and `traceId` are overwritten with the retrieved
 *     content and trace id;
 *   - without a `doId` but with a `url`: the page is loaded (arXiv links try
 *     the HTML rendering first, then fall back to the PDF or plain page).
 *
 * Fixes compared with the previous version:
 * - An arXiv page containing the "No HTML for" marker used to hit a bare
 *   `return` inside the document loop, which made the whole function
 *   resolve to `undefined`. The placeholder document is now skipped and the
 *   fallback loader runs.
 * - A failing `retrieveDoc` call used to reject the whole search. It is now
 *   logged and the hit is kept un-enriched, like every other loader failure
 *   in this function.
 *
 * @param query    forwarded to `localIodSearch`
 * @param keywords forwarded to `localIodSearch`
 * @returns the registry hits (enriched in place when not in simple mode)
 */
export const searchIod = async (query: string, keywords: string[]) => {
  const searchResults = await localIodSearch(query, keywords)

  const isSimpleMode = await getIsSimpleInternetSearch()
  console.log("searchMode:"+isSimpleMode+"\n kw:"+JSON.stringify(keywords)+"\n"+" ->searchResult:\n"+JSON.stringify(searchResults))
  console.log("pruned Search Result:"+JSON.stringify(searchResults.map(r=>r.doId+" "+r.name)))
  if (isSimpleMode) {
    await getOllamaURL()
    return searchResults
  }

  // Both collections are only consumed by the disabled ranking code at the
  // end of this function; they are kept so that code can be re-enabled.
  const docs: Document<Record<string, any>>[] = []
  const resMap = new Map<string, IodRegistryEntry>()

  for (const result of searchResults) {
    const url = result.url

    if (result.doId) {
      //TODO !!!!@Nex traceId should be the id of history/question!
      try {
        const docFromRetrieve = await retrieveDoc(result.doId)
        console.log("doc from Retrieve:"+result.doId+" -->"+JSON.stringify(docFromRetrieve))
        docs.push(docFromRetrieve)
        result.description = docFromRetrieve.pageContent
        result.traceId = docFromRetrieve.metadata?.traceId
      } catch (e) {
        // One unreachable object must not discard the other hits.
        console.log(e)
      }
      continue
    }

    if (!url) {
      continue
    }

    // arXiv links: derive the HTML rendering URL from the PDF URL.
    let htmlUrl = ""
    if (ARXIV_URL_PATTERN.test(url)) {
      htmlUrl = url.replace("/pdf/", "/html/").replace(".pdf", "")
    }

    let noHtml = htmlUrl === ""
    if (!noHtml) {
      const loader = new PageAssistHtmlLoader({
        html: "",
        url: htmlUrl
      })

      try {
        const documents = await loader.loadByURL()
        for (const doc of documents) {
          if (doc.pageContent.includes(ARXIV_NO_HTM)) {
            // Placeholder page: skip it and use the fallback loader below.
            noHtml = true
            continue
          }
          docs.push(doc)
        }
      } catch (e) {
        console.log(e)
        noHtml = true
      }
    }

    if (noHtml) {
      if (url.endsWith(".pdf")) {
        const loader = new PageAssistPDFUrlLoader({
          name: result.name,
          url
        })

        try {
          const documents = await loader.load()
          for (const doc of documents) {
            docs.push(doc)
          }
        } catch (e) {
          console.log(e)
        }
      } else {
        const loader = new PageAssistHtmlLoader({
          html: "",
          url
        })

        try {
          const documents = await loader.loadByURL()
          for (const doc of documents) {
            docs.push(doc)
          }
        } catch (e) {
          console.log(e)
        }
      }
    }
  }

  return searchResults

  /*
  Disabled: embedding-based ranking of the loaded documents.

  const ollamaUrl = await getOllamaURL()

  const embeddingModle = await defaultEmbeddingModelForRag()
  const ollamaEmbedding = await pageAssistEmbeddingModel({
    model: embeddingModle || "",
    baseUrl: cleanUrl(ollamaUrl)
  })

  const textSplitter = await getPageAssistTextSplitter()

  const chunks = await textSplitter.splitDocuments(docs)

  const store = new MemoryVectorStore(ollamaEmbedding)

  await store.addDocuments(chunks)

  const resultsWithEmbeddings = await store.similaritySearch(query, 3)

  const searchResult = resultsWithEmbeddings.map((result) => {
    // `source` for PDF type
    const key = result.metadata.url || result.metadata.source
    if (!key) return null
    const fullRes = resMap[key]
    return {
      ...fullRes,
      content: result.pageContent
    }
  }).filter((r) => r)

  return searchResult
  */
}
|
2025-08-17 22:39:12 +08:00
|
|
|
|
|
|
|
|
|
|
/**
 * Returns the number of bytes `str` occupies when encoded with `TextEncoder`
 * (UTF-8). Despite the name, no tokenizer is involved.
 *
 * @param str text to measure
 * @returns the encoded byte length
 */
export const calculateTokenCount = function(str:string){
  const { length } = new TextEncoder().encode(str);
  return length;
}
|
|
|
|
|
|
|