Files
page-assist/src/web/iod.ts

330 lines
9.3 KiB
TypeScript
Raw Normal View History

2025-02-14 18:17:12 +08:00
import { cleanUrl } from "@/libs/clean-url"
import { PageAssistHtmlLoader } from "@/loader/html"
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
2025-02-14 18:17:12 +08:00
import { pageAssistEmbeddingModel } from "@/models/embedding"
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
import {
getIsSimpleInternetSearch,
totalSearchResults
} from "@/services/search"
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
2025-03-24 13:21:49 +08:00
import { Document } from "@langchain/core/documents"
2025-02-14 18:17:12 +08:00
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import type { IodRegistryEntry } from "~/types/iod"
2025-02-14 18:17:12 +08:00
2025-03-24 13:21:49 +08:00
import { PageAssitDatabase } from "@/db"
// Alternative doipUrl kept for reference: tcp://reg01.public.internetofdata.cn:21037
/**
 * Addresses and identifiers shared by every IoD request built in this module.
 *
 * - `gatewayUrl`: sent as the `doipUrl` argument of each request.
 * - `registry`: object id targeted by registry searches.
 * - `localRepository`: object id targeted by dialog updates.
 * - `doBrowser`: HTTP endpoint that all requests are POSTed to.
 */
export const iodConfig = {
  gatewayUrl: "tcp://127.0.0.1:21051",
  registry: "bdware/Registry",
  localRepository: "bdtest.local/myrepo1",
  doBrowser: "http://127.0.0.1:21030/SCIDE/SCManager"
}
/**
 * Builds the request payload for a "Search" operation against the registry
 * configured in `iodConfig`.
 *
 * The search is restricted to entries whose `data_type` is "paper" and whose
 * `description` matches `keyword`.
 *
 * @param count   Value sent as `attributes.count` (offset is always 0).
 * @param keyword Value the `description` field must match.
 * @returns A plain object ready to be JSON-serialised as the request body.
 */
export const makeRegSearchParams = (count: number, keyword: string) => ({
  action: "executeContract",
  contractID: "BDBrowser",
  operation: "sendRequestDirectly",
  arg: {
    id: iodConfig.registry,
    doipUrl: iodConfig.gatewayUrl,
    op: "Search",
    attributes: {
      offset: 0,
      count,
      bodyBase64Encoded: false,
      searchMode: [
        {
          key: "data_type",
          type: "MUST",
          value: "paper"
        },
        // Matching on the title is currently disabled:
        // {
        //   key: "title",
        //   type: "MUST",
        //   value: keyword,
        // },
        {
          key: "description",
          type: "MUST",
          value: keyword
        }
      ]
    },
    body: ""
  }
})
2025-03-24 13:21:49 +08:00
/**
 * Wraps a single operation on a digital object into the "executeContract"
 * envelope used for every request in this module.
 *
 * @param doId        Id of the target object (sent as `arg.id`).
 * @param op          Operation name (sent as `arg.op`).
 * @param attributes  Operation attributes, forwarded unchanged.
 * @param requestBody Request body, forwarded unchanged as `arg.body`.
 * @returns A plain object ready to be JSON-serialised as the request body.
 */
export const makeDOIPParams = (doId:string, op:string, attributes:Object, requestBody: string) => {
  // The gateway address always comes from the shared configuration.
  const arg = {
    id: doId,
    doipUrl: iodConfig.gatewayUrl,
    op,
    attributes,
    body: requestBody
  }
  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg
  }
}
/**
 * Sends a "Retrieve" operation for one digital object and resolves with its
 * content as a document.
 *
 * The request is aborted after 10 seconds. The abort timer is always cleared
 * once the request settles, so no timer outlives the call.
 *
 * @param doId    Id of the object to retrieve.
 * @param traceId Trace id forwarded in the request attributes.
 * @returns The retrieved content. A string body is wrapped as
 *   `{ pageContent, metadata }` (the two document fields this file reads),
 *   with `metadata.source` set to `doId`; a non-string body is returned as
 *   received.
 * @throws Error when the response carries no `result.body`; also rejects when
 *   the request fails, times out, or the response is not valid JSON.
 */
export const retrieveDoc = async function(doId: string, traceId: string) : Promise<Document> {
  console.log("retriveDoc:"+doId+" -> traceId:"+traceId)
  const params = makeDOIPParams(doId,"Retrieve",{
    "traceId": traceId,
    bodyBase64Encoded: false
  }, "");
  const abortController = new AbortController()
  const timer = setTimeout(() => abortController.abort(), 10000)
  try {
    const response = await fetch(iodConfig.doBrowser, {
      method: "POST",
      body: JSON.stringify(params),
      signal: abortController.signal
    })
    console.log("responseIn retrieveDoc:");
    console.log(response);
    const res = await response.json()
    const body = res?.result?.body
    console.log("res:");
    console.log(body);
    if (body == null) {
      // Fail with context instead of an opaque TypeError or a silent `undefined`.
      throw new Error("retrieveDoc: no body returned for " + doId + ": " + JSON.stringify(res))
    }
    // The request asks for a non-base64 body, so it normally arrives as text;
    // wrap that text so callers really receive a document-shaped value.
    return typeof body === "string"
      ? { pageContent: body, metadata: { source: doId, doId, traceId } }
      : body
  } finally {
    clearTimeout(timer)
  }
}
/**
 * Sends an "Update" operation carrying `requestBody` to the local repository
 * configured in `iodConfig`.
 *
 * The request is aborted after 10 seconds; the abort timer is always cleared
 * once the request settles.
 *
 * @param historyId   Sent as the `aiDialogID` attribute.
 * @param requestBody Payload; JSON-serialised into the request body.
 * @returns The body reported by the endpoint.
 * @throws Rejects when the request fails, times out, or the response is not
 *   valid JSON.
 */
export const updateInLocalRepo = async function(historyId: string, requestBody: Object) : Promise<string> {
  const params = makeDOIPParams(iodConfig.localRepository,"Update",{
    "aiDialogID": historyId,
    bodyBase64Encoded: false
  }, JSON.stringify(requestBody));
  const abortController = new AbortController()
  const timer = setTimeout(() => abortController.abort(), 10000)
  try {
    const response = await fetch(iodConfig.doBrowser, {
      method: "POST",
      body: JSON.stringify(params),
      signal: abortController.signal
    })
    const res = await response.json()
    console.log("update dialog:"+JSON.stringify(res))
    // NOTE(review): the other handlers for this endpoint read `res.result.body`.
    // `res.body` is kept as the first choice for compatibility, with the nested
    // field as a fallback - confirm which shape the Update operation returns.
    return res.body ?? res.result?.body
  } finally {
    clearTimeout(timer)
  }
}
/**
 * Records one question/answer exchange of a chat in the local repository.
 *
 * The question is the first message with role "user" found in the stored chat
 * history; the answer is `botMessage`.
 *
 * @param histroyId  Id of the chat history; also the prefix of all generated ids.
 * @param botMessage The answer message. Reads `id`, `content`, and the optional
 *   `webSources` / `iodSources` arrays.
 * @returns Whatever `updateInLocalRepo` resolves with.
 * @throws Error when the history contains no user message.
 */
export const updateDialog = async function(histroyId : string, botMessage: any): Promise<string> {
  //TODO @Nex confused by Message/MessageType in ./db/index.ts!
  const db = new PageAssitDatabase()
  const chatHistory = await db.getChatHistory(histroyId)
  const userMessage = chatHistory.find((message) => message.role === "user")
  if (!userMessage) {
    // Without a question there is nothing meaningful to record; previously this
    // either crashed on `null` or silently recorded a non-user message.
    throw new Error("updateDialog: no user message found in history " + histroyId)
  }
  console.log(userMessage)
  console.log(botMessage)
  // !!!IMPORTANT!!! traceId = histroyId+"/"+userMessage.id;
  // Update traceId in retrieveDoc!
  const questionId = histroyId+"/"+userMessage.id
  const updateBody = {
    traceId: questionId,
    question: {
      "id": questionId,
      "content": userMessage.content,
      "tokenCount": userMessage.content.length
    },
    answer: {
      "id": histroyId+"/"+botMessage.id,
      "content": botMessage.content,
      "tokenCount": botMessage.content.length
    },
    //TODO set a correct model ID
    model: {"id":"bdware.ollama/" + userMessage.name},
    //TODO incorrect tokenCount calculated!!
    webSources: botMessage.webSources?.map((r) => ({
      url: r.url,
      tokenCount: r.url.length,
      content: r.url
    })) ?? [],
    IoDSources: botMessage.iodSources?.map((r) => {
      // A source without a description must not abort the whole update.
      const description = r.description ?? ""
      return {
        id: r.doId,
        tokenCount: description.length,
        content: description
      }
    }) ?? []
  }
  console.log("updateBody:");
  console.log(updateBody)
  return updateInLocalRepo(histroyId,updateBody)
}
/**
 * Searches the IoD registry once per keyword and returns the merged,
 * de-duplicated entries.
 *
 * Every keyword is searched in parallel, with a 10 second timeout each. A
 * keyword whose request fails, or whose response does not report success, is
 * logged and contributes no entries - it never fails the whole search.
 *
 * @param query    Currently unused; kept for interface compatibility.
 * @param keywords Keywords to search for, one request each.
 * @returns Entries unique by `doId` (falling back to `url`), each with `url`
 *   defaulted to `pdf_url` when empty.
 */
export async function localIodSearch(
  query: string,
  keywords: string[]
): Promise<IodRegistryEntry[]> {
  const maxResults = await totalSearchResults()

  // Runs one registry search; resolves to [] on any failure.
  const searchKeyword = async (keyword: string): Promise<IodRegistryEntry[]> => {
    const abortController = new AbortController()
    const timer = setTimeout(() => abortController.abort(), 10000)
    //http://47.93.156.31:21033/SCIDE/SCManager
    const params = makeRegSearchParams(maxResults, keyword)
    try {
      const response = await fetch(iodConfig.doBrowser, {
        method: "POST",
        body: JSON.stringify(params),
        signal: abortController.signal
      })
      const res = await response.json()
      if (res.status !== "Success") {
        console.log(res)
        return []
      }
      const body = JSON.parse(res.result.body)
      if (body.code !== 0) {
        console.log(body)
        return []
      }
      const entries: IodRegistryEntry[] = Array.isArray(body.data?.results)
        ? body.data.results
        : []
      for (const entry of entries) {
        entry.url = entry.url || entry.pdf_url
      }
      return entries
    } catch (e) {
      console.log(e)
      return []
    } finally {
      // Do not leave the abort timer pending once the request has settled.
      clearTimeout(timer)
    }
  }

  const results = (await Promise.all(keywords.map(searchKeyword))).flat()

  // De-duplicate by doId. Entries without a doId are keyed by url, and entries
  // with neither get a positional key, so they no longer all collapse into a
  // single `undefined` slot.
  const unique = new Map<string, IodRegistryEntry>()
  results.forEach((entry, index) => {
    unique.set(entry.doId || entry.url || "#" + index, entry)
  })
  const uniqueResults = Array.from(unique.values())
  // A Map serialises to "{}", so log the values instead.
  console.log("result from IoD:"+JSON.stringify(uniqueResults)+"--> kw:"+JSON.stringify(keywords));
  return uniqueResults
}
// Matches URLs that start with http://arxiv.org/ or https://arxiv.org/.
const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\//
// Substring looked for in a loaded page's content to decide that no HTML version
// is available (presumably the text arXiv serves in that case - TODO confirm).
const ARXIV_NO_HTM = "No HTML for"
2025-02-14 18:17:12 +08:00
/**
 * Searches the IoD registry and, unless simple search mode is enabled, ranks
 * the content of the found entries against `query`.
 *
 * In simple mode the registry entries are returned as they are. Otherwise the
 * content of every entry is loaded, split into chunks, embedded into an
 * in-memory vector store, and the 3 chunks most similar to `query` are
 * returned, each merged with the registry entry it came from.
 *
 * Content is loaded as follows:
 * - entries with a `doId` are fetched through `retrieveDoc`;
 * - arXiv URLs are first tried as HTML (`/pdf/` -> `/html/`) and fall back to
 *   the original URL when no HTML version exists;
 * - URLs ending in ".pdf" use the PDF loader, and all others the HTML loader.
 *
 * A failure to load one entry is logged and that entry is skipped.
 *
 * @param query    Text the chunks are ranked against.
 * @param keywords Keywords forwarded to the registry search.
 * @returns Registry entries (simple mode), or ranked chunks of the form
 *   `{ ...registryEntry, content }`.
 */
export const searchIod = async (query: string, keywords: string[]) => {
  const searchResults = await localIodSearch(query, keywords)
  const isSimpleMode = await getIsSimpleInternetSearch()
  console.log("searchMode:"+isSimpleMode+" ->searchResult:\n"+JSON.stringify(searchResults))

  if (isSimpleMode) {
    await getOllamaURL()
    return searchResults
  }

  const docs: Document<Record<string, any>>[] = []
  // Maps the key found in a chunk's metadata (url or source) to its registry entry.
  const resMap = new Map<string, IodRegistryEntry>()

  for (const result of searchResults) {
    const url = result.url

    if (result.doId) {
      //TODO !!!!@Nex traceId should be the id of history/question!
      const traceId = new Date().getTime() + "";
      try {
        const retrieved = await retrieveDoc(result.doId, traceId);
        console.log("doc from Retrieve:"+result.doId+" -->"+JSON.stringify(retrieved))
        // retrieveDoc may hand back the raw body text rather than a document.
        const isRawText = typeof retrieved === "string"
        const pageContent = isRawText ? retrieved : retrieved?.pageContent
        const metadata = {
          source: result.doId,
          ...(isRawText ? {} : retrieved?.metadata)
        }
        if (typeof pageContent === "string" && pageContent !== "") {
          docs.push({ pageContent, metadata })
          resMap.set(metadata.url || metadata.source, result)
        }
      } catch (e) {
        // Same policy as the URL loaders below: log and skip this entry.
        console.log(e)
      }
      continue;
    }

    if (!url) {
      continue;
    }
    resMap.set(url, result)

    let htmlUrl = ""
    if (ARXIV_URL_PATTERN.test(url)) {
      htmlUrl = url.replace("/pdf/", "/html/").replace(".pdf", "")
    }

    let noHtml = htmlUrl === ""
    if (!noHtml) {
      // Chunks loaded from the HTML variant must resolve to the same entry.
      resMap.set(htmlUrl, result)
      const loader = new PageAssistHtmlLoader({
        html: "",
        url: htmlUrl
      })
      try {
        const documents = await loader.loadByURL()
        if (documents.some((doc) => doc.pageContent.includes(ARXIV_NO_HTM))) {
          // No HTML version: fall through to the original URL. The previous
          // `return` here ended the whole search with `undefined`.
          noHtml = true
        } else {
          docs.push(...documents)
        }
      } catch (e) {
        console.log(e)
        noHtml = true
      }
    }

    if (noHtml) {
      try {
        const documents = url.endsWith(".pdf")
          ? await new PageAssistPDFUrlLoader({ name: result.name, url }).load()
          : await new PageAssistHtmlLoader({ html: "", url }).loadByURL()
        docs.push(...documents)
      } catch (e) {
        console.log(e)
      }
    }
  }

  const ollamaUrl = await getOllamaURL()
  const embeddingModle = await defaultEmbeddingModelForRag()
  const ollamaEmbedding = await pageAssistEmbeddingModel({
    model: embeddingModle || "",
    baseUrl: cleanUrl(ollamaUrl)
  })
  const textSplitter = await getPageAssistTextSplitter()
  const chunks = await textSplitter.splitDocuments(docs)
  const store = new MemoryVectorStore(ollamaEmbedding)
  await store.addDocuments(chunks)
  const resultsWithEmbeddings = await store.similaritySearch(query, 3)

  const searchResult = resultsWithEmbeddings.map((hit) => {
    // `source` for PDF type
    const key = hit.metadata.url || hit.metadata.source
    if (!key) return null
    // Map entries must be read with get(); bracket access is always undefined.
    const fullRes = resMap.get(key)
    return {
      ...fullRes,
      content: hit.pageContent
    }
  }).filter((r) => r)

  return searchResult
}