import { cleanUrl } from "@/libs/clean-url"
import { PageAssistHtmlLoader } from "@/loader/html"
import { pageAssistEmbeddingModel } from "@/models/embedding"
import {
  defaultEmbeddingModelForRag,
  getOllamaURL
} from "@/services/ollama"
import {
  getIsSimpleInternetSearch,
  totalSearchResults
} from "@/services/search"
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
import type { Document } from "@langchain/core/documents"
import { MemoryVectorStore } from "langchain/vectorstores/memory"

/** Endpoint that receives the search request built by `makeRegSearchParams`. */
const IOD_SEARCH_ENDPOINT = "http://47.93.156.31:21033/SCIDE/SCManager"

/** Per-keyword request budget; the request is aborted once it elapses. */
const IOD_REQUEST_TIMEOUT_MS = 10000

/**
 * Builds the JSON payload for one keyword search.
 *
 * The payload asks for at most `count` records whose `data_type` is "paper"
 * and whose `description` matches `keyword`.
 *
 * @param count - Maximum number of records requested.
 * @param keyword - Value matched against the `description` field.
 * @returns The request object, ready to be serialized with `JSON.stringify`.
 */
const makeRegSearchParams = (count: number, keyword: string) => ({
  action: "executeContract",
  contractID: "BDBrowser",
  operation: "sendRequestDirectly",
  arg: {
    id: "670E241C9937B3537047C87053E3AA36",
    doipUrl: "tcp://reg01.public.internetofdata.cn:21037",
    op: "Search",
    attributes: {
      offset: 0,
      count,
      bodyBase64Encoded: false,
      searchMode: [
        { key: "data_type", type: "MUST", value: "paper" },
        // Matching on the title is currently disabled:
        // {
        //   key: "title",
        //   type: "MUST",
        //   value: keyword,
        // },
        { key: "description", type: "MUST", value: keyword }
      ]
    },
    body: ""
  }
})

/**
 * Runs one search request per keyword, in parallel, and merges the hits.
 *
 * Each request is best-effort: a network error, a timeout, a non-"Success"
 * status or a non-zero body code is logged and contributes no results, so
 * this function does not reject because of a single failing keyword.
 *
 * @param query - The original user query. Not used by the search itself; kept
 *   so the signature stays compatible with existing callers.
 * @param keywords - Keywords to search for; one request is sent per keyword.
 * @returns A flat array of the records returned for all keywords. Only
 *   records that have a `url` or a `pdf_url` are kept, and `url` is filled
 *   from `pdf_url` when it is missing.
 */
export const localIodSearch = async (query: string, keywords: string[]) => {
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()

  const results = (
    await Promise.all(
      keywords.map(async (keyword) => {
        const abortController = new AbortController()
        const timeoutId = setTimeout(
          () => abortController.abort(),
          IOD_REQUEST_TIMEOUT_MS
        )

        try {
          const params = makeRegSearchParams(TOTAL_SEARCH_RESULTS, keyword)
          const response = await fetch(IOD_SEARCH_ENDPOINT, {
            method: "POST",
            body: JSON.stringify(params),
            signal: abortController.signal
          })
          const res = await response.json()

          if (res.status !== "Success") {
            console.log(res)
            return []
          }

          // The actual search payload arrives as a JSON string inside the
          // response envelope, so it has to be parsed a second time.
          const body = JSON.parse(res.result.body)
          if (body.code !== 0) {
            console.log(body)
            return []
          }

          const results =
            body.data?.results?.filter((r) => r.url || r.pdf_url) || []
          results.forEach((r) => {
            // Downstream code only reads `url`, so fall back to the PDF link.
            r.url = r.url || r.pdf_url
          })
          return results
        } catch (e) {
          console.log(e)
          return []
        } finally {
          // Release the timer once the request has settled; otherwise every
          // keyword leaves a pending timeout behind for the full duration.
          clearTimeout(timeoutId)
        }
      })
    )
  ).flat()

  return results
}

/** Matches URLs hosted on arxiv.org (the dot is escaped so it is literal). */
const ARXIV_URL = /^https:\/\/arxiv\.org\//

/**
 * Searches for `keywords` and, unless simple mode is enabled, ranks the
 * content of the result pages against `query`.
 *
 * In simple mode the raw records from `localIodSearch` are returned as-is.
 * Otherwise every result page is loaded, split into chunks, embedded into an
 * in-memory vector store, and the 3 chunks most similar to `query` are
 * returned.
 *
 * @param query - User query used for the similarity search.
 * @param keywords - Keywords forwarded to `localIodSearch`.
 * @returns In simple mode, the raw search records; otherwise an array of
 *   `{ url, content }` objects built from the best-matching chunks.
 */
export const searchIod = async (query: string, keywords: string[]) => {
  const searchResults = await localIodSearch(query, keywords)

  const isSimpleMode = await getIsSimpleInternetSearch()
  if (isSimpleMode) {
    // NOTE(review): the resolved URL is discarded here. The call is kept
    // because it may have side effects this file cannot see; confirm
    // before removing it.
    await getOllamaURL()
    return searchResults
  }

  const docs: Document[] = []
  for (const result of searchResults) {
    let url = result.url
    if (ARXIV_URL.test(result.url)) {
      // Point at the arXiv abstract page instead of the PDF.
      url = result.url.replace("/pdf/", "/abs/").replace(".pdf", "")
    }

    const loader = new PageAssistHtmlLoader({ html: "", url })
    try {
      const documents = await loader.loadByURL()
      docs.push(...documents)
    } catch (e) {
      // Best-effort, like the search requests: a single page that cannot be
      // loaded must not discard the pages that were loaded successfully.
      console.log(e)
    }
  }

  const ollamaUrl = await getOllamaURL()
  const embeddingModel = await defaultEmbeddingModelForRag()
  const ollamaEmbedding = await pageAssistEmbeddingModel({
    model: embeddingModel || "",
    baseUrl: cleanUrl(ollamaUrl)
  })

  const textSplitter = await getPageAssistTextSplitter()
  const chunks = await textSplitter.splitDocuments(docs)

  const store = new MemoryVectorStore(ollamaEmbedding)
  await store.addDocuments(chunks)

  const resultsWithEmbeddings = await store.similaritySearch(query, 3)

  return resultsWithEmbeddings.map((result) => ({
    url: result.metadata.url,
    content: result.pageContent
  }))
}