2024-02-02 22:01:16 +05:30
|
|
|
import { BaseDocumentLoader } from "langchain/document_loaders/base"
|
2024-02-03 17:51:11 +05:30
|
|
|
import { Document } from "@langchain/core/documents"
|
2024-02-02 22:01:16 +05:30
|
|
|
import { compile } from "html-to-text"
|
2024-02-25 00:12:46 +05:30
|
|
|
import { chromeRunTime } from "~libs/runtime"
|
2024-02-02 22:01:16 +05:30
|
|
|
|
2024-02-25 00:12:46 +05:30
|
|
|
/**
 * Reports whether the resource at `url` is served as a PDF.
 *
 * `chromeRunTime(url)` is awaited before the request is made
 * (NOTE(review): presumably it prepares the extension's request handling for
 * this URL — confirm in `~libs/runtime`). The URL is then fetched and the
 * response's `Content-Type` header is inspected.
 *
 * Only the media-type essence is compared, case-insensitively, so a header
 * such as `application/pdf; charset=binary` is still recognised. The body is
 * never read: it is cancelled so a large file is not downloaded just to learn
 * its type.
 *
 * @param url - Address of the resource to probe.
 * @returns `true` when the declared media type is `application/pdf`;
 *   `false` otherwise, including when no `Content-Type` header is present.
 * @throws Whatever `chromeRunTime` or `fetch` rejects with.
 */
const isPDFFetch = async (url: string): Promise<boolean> => {
  await chromeRunTime(url)
  const response = await fetch(url)

  // Best-effort cleanup: only the headers are needed, so release the body.
  // A failure to cancel must not change the answer, hence the ignored error.
  void response.body?.cancel().catch(() => undefined)

  const contentType = response.headers.get("content-type") ?? ""
  // Drop any parameters (";charset=...", ";name=...") before comparing.
  const [mediaType = ""] = contentType.split(";", 1)
  return mediaType.trim().toLowerCase() === "application/pdf"
}
|
2024-02-02 22:01:16 +05:30
|
|
|
/**
 * Constructor arguments for {@link PageAssistHtmlLoader}.
 */
export interface WebLoaderParams {
  /** HTML markup that the loader converts to plain text. */
  html: string

  /** URL stored as the `source` entry of the produced document's metadata. */
  url: string
}
|
|
|
|
|
|
|
|
|
|
/**
 * Document loader that converts an in-memory HTML string into a single
 * plain-text `Document`, tagging it with the URL it was given.
 */
export class PageAssistHtmlLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  /** HTML markup to be converted by {@link load}. */
  html: string

  /** URL written to the resulting document's `metadata.source`. */
  url: string

  /**
   * @param params - The HTML to convert and the URL to record as its source.
   */
  constructor(params: WebLoaderParams) {
    super()
    this.html = params.html
    this.url = params.url
  }

  /**
   * Converts the stored HTML to text with `html-to-text` (word wrapping
   * disabled) and wraps it in one `Document`.
   *
   * @returns A one-element array whose document has the converted text as
   *   `pageContent` and `{ source: url }` as metadata.
   */
  async load(): Promise<Document<Record<string, any>>[]> {
    const toPlainText = compile({ wordwrap: false })
    const doc = new Document({
      pageContent: toPlainText(this.html),
      metadata: { source: this.url }
    })
    return [doc]
  }
}
|