import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch"; import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate"; import { withPage } from "./playwright"; import { spatialParser } from "./parser"; import { htmlToMarkdownTree } from "../markdown/tree"; import { timeout } from "$lib/utils/timeout"; import { makeGeneralUpdate } from "../update"; import { MetricsServer } from "$lib/server/metrics"; import { logger } from "$lib/server/logger"; export const scrape = (maxCharsPerElem: number) => async function* ( source: WebSearchSource ): AsyncGenerator { try { const startTime = Date.now(); MetricsServer.getMetrics().webSearch.pageFetchCount.inc(); const page = await scrapeUrl(source.link, maxCharsPerElem); MetricsServer.getMetrics().webSearch.pageFetchDuration.observe(Date.now() - startTime); yield makeGeneralUpdate({ message: "Browsing webpage", args: [source.link], }); return { ...source, page }; } catch (e) { MetricsServer.getMetrics().webSearch.pageFetchCountError.inc(); logger.error(e, `Error scraping webpage: ${source.link}`); } }; export async function scrapeUrl(url: string, maxCharsPerElem: number) { return withPage(url, async (page, res) => { if (!res) throw Error("Failed to load page"); if (!res.ok()) throw Error(`Failed to load page: ${res.status()}`); // Check if it's a non-html content type that we can handle directly // TODO: direct mappings to markdown can be added for markdown, csv and others const contentType = res.headers()["content-type"] ?? ""; if ( contentType.includes("text/plain") || contentType.includes("text/markdown") || contentType.includes("application/json") || contentType.includes("application/xml") || contentType.includes("text/csv") ) { const title = await page.title(); const content = await page.content(); return { title, markdownTree: htmlToMarkdownTree( title, [{ tagName: "p", attributes: {}, content: [content] }], maxCharsPerElem ), }; } const scrapedOutput = await timeout(page.evaluate(spatialParser), 2000) .then(({ elements, ...parsed }) => ({ ...parsed, markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem), })) .catch((cause) => { throw Error("Parsing failed", { cause }); }); return scrapedOutput; }); }