Spaces:
Running
Running
File size: 2,393 Bytes
2c00ea8 564e576 aee936e 2c00ea8 76a7af0 6e18e46 76a7af0 2c00ea8 564e576 2c00ea8 6e18e46 2c00ea8 6e18e46 76a7af0 2c00ea8 6e18e46 dc98038 2c00ea8 aee936e 853f8fc 564e576 853f8fc 564e576 853f8fc 2c00ea8 853f8fc aee936e 2c00ea8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";
import { withPage } from "./playwright";
import { spatialParser } from "./parser";
import { htmlToMarkdownTree } from "../markdown/tree";
import { timeout } from "$lib/utils/timeout";
import { makeGeneralUpdate } from "../update";
import { MetricsServer } from "$lib/server/metrics";
import { logger } from "$lib/server/logger";
/**
 * Creates a scraper for a single web search source, bound to a per-element
 * character limit.
 *
 * The returned async generator:
 * - increments the page fetch counter and scrapes `source.link` via `scrapeUrl`;
 * - records the elapsed wall-clock time (from `Date.now()`) once the scrape succeeds;
 * - yields one "Browsing webpage" update, emitted only after a successful scrape;
 * - finishes with the source extended by the scraped `page`.
 *
 * Errors raised inside the `try` block are caught: the error counter is
 * incremented, the error is logged, and the generator finishes with `undefined`.
 *
 * @param maxCharsPerElem - Limit forwarded unchanged to `scrapeUrl`.
 * @returns An async generator function taking the source to scrape.
 */
export const scrape = (maxCharsPerElem: number) =>
  async function* (
    source: WebSearchSource
  ): AsyncGenerator<MessageWebSearchUpdate, WebSearchScrapedSource | undefined, undefined> {
    // Resolved on every use (not cached) so each metric update performs its own lookup.
    const webSearchMetrics = () => MetricsServer.getMetrics().webSearch;

    try {
      const fetchStartedAt = Date.now();
      webSearchMetrics().pageFetchCount.inc();

      const page = await scrapeUrl(source.link, maxCharsPerElem);

      // Only successful fetches contribute to the duration metric.
      webSearchMetrics().pageFetchDuration.observe(Date.now() - fetchStartedAt);

      yield makeGeneralUpdate({ message: "Browsing webpage", args: [source.link] });

      return { ...source, page };
    } catch (e) {
      webSearchMetrics().pageFetchCountError.inc();
      logger.error(e, `Error scraping webpage: ${source.link}`);
      // Best-effort: a failed page is reported as "no result" instead of propagating.
      return undefined;
    }
  };
/**
 * Content types whose body is passed through as a single text element instead
 * of being run through the spatial HTML parser.
 */
const DIRECT_TEXT_CONTENT_TYPES = [
  "text/plain",
  "text/markdown",
  "application/json",
  "application/xml",
  "text/csv",
] as const;

/** Budget handed to `timeout` for the in-page parser (presumably milliseconds — confirm against `timeout`). */
const SPATIAL_PARSER_TIMEOUT = 2000;

/**
 * Loads `url` through `withPage` and converts what was loaded into a markdown tree.
 *
 * - Responses whose `content-type` matches one of `DIRECT_TEXT_CONTENT_TYPES`
 *   (compared case-insensitively) are wrapped as a single `p` element.
 * - Everything else is parsed in the page with `spatialParser`.
 *
 * @param url - Address of the page to load.
 * @param maxCharsPerElem - Limit forwarded to `htmlToMarkdownTree`.
 * @returns The page title together with its `markdownTree`; for parsed pages,
 *   any other fields produced by `spatialParser` (except `elements`) are included.
 * @throws Error "Failed to load page" when `withPage` provides no response.
 * @throws Error "Parsing failed" (original error attached as `cause`) when the
 *   parser fails, times out, or the markdown conversion throws.
 */
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
  return withPage(url, async (page, res) => {
    if (!res) throw new Error("Failed to load page");

    // Check if it's a non-html content type that we can handle directly
    // TODO: direct mappings to markdown can be added for markdown, csv and others
    // Media types are case-insensitive, so normalise the header value before matching.
    const contentType = (res.headers()["content-type"] ?? "").toLowerCase();
    const isDirectText = DIRECT_TEXT_CONTENT_TYPES.some((type) => contentType.includes(type));

    if (isDirectText) {
      const title = await page.title();
      // NOTE(review): page.content() presumably returns the browser's serialized
      // document, not the raw response body — confirm this is what is wanted here.
      const content = await page.content();
      return {
        title,
        markdownTree: htmlToMarkdownTree(
          title,
          [{ tagName: "p", attributes: {}, content: [content] }],
          maxCharsPerElem
        ),
      };
    }

    try {
      const { elements, ...parsed } = await timeout(
        page.evaluate(spatialParser),
        SPATIAL_PARSER_TIMEOUT
      );
      return {
        ...parsed,
        markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
      };
    } catch (cause) {
      // Covers evaluation errors, the timeout, and markdown conversion failures alike.
      throw new Error("Parsing failed", { cause });
    }
  });
}
|