Spaces:

jdelavande
/

chat-ui-energy

Running on CPU Upgrade

App Files Files Community

Liam Dyer

nsarrazin HF Staff committed on May 27, 2024

Commit

aee936e

unverified ·

1 Parent(s): 6e18e46

Fix Playwright leak and use multiple contexts (#1187)

Browse files

* fix: playwright leak and use multiple contexts

* lint

---------

Co-authored-by: Nathan Sarrazin <[email protected]>

Files changed (2) hide show

src/lib/server/websearch/scrape/playwright.ts +36 -29
src/lib/server/websearch/scrape/scrape.ts +3 -7

src/lib/server/websearch/scrape/playwright.ts CHANGED Viewed

@@ -1,23 +1,35 @@
 import {
-	type BrowserContext,
 	chromium,
 	devices,
 	type Page,
 	type BrowserContextOptions,
 	type Response,
 } from "playwright";
 import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
 import { env } from "$env/dynamic/private";
-// Singleton initialized by initPlaywrightService
-let playwrightService: Promise<{ ctx: BrowserContext; blocker: PlaywrightBlocker }> | undefined;
-async function initPlaywrightService() {
-	if (playwrightService) return playwrightService;
 	const browser = await chromium.launch({ headless: true });
 	process.on("SIGINT", () => browser.close());
 	const device = devices["Desktop Chrome"];
 	const options: BrowserContextOptions = {
@@ -36,31 +48,26 @@ async function initPlaywrightService() {
 		timezoneId: "America/New_York",
 		locale: "en-US",
 	};
-	const ctx = await browser.newContext(options);
-	const blocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then((blker) => {
-		const mostBlocked = blker.blockFonts().blockMedias().blockFrames().blockImages();
-		if (env.WEBSEARCH_JAVASCRIPT === "false") return mostBlocked.blockScripts();
-		return mostBlocked;
-	});
-	// Clear the singleton when the context closes
-	ctx.on("close", () => {
-		playwrightService = undefined;
-	});
-	return Object.freeze({ ctx, blocker });
 }
-export async function loadPage(url: string): Promise<{ res?: Response; page: Page }> {
-	if (!playwrightService) playwrightService = initPlaywrightService();
-	const { ctx, blocker } = await playwrightService;
-	const page = await ctx.newPage();
-	await blocker.enableBlockingInPage(page);
-	const res = await page.goto(url, { waitUntil: "load", timeout: 3500 }).catch(() => {
-		console.warn(`Failed to load page within 2s: ${url}`);
-	});
-	return { res: res ?? undefined, page };
 }

 import {
 	chromium,
 	devices,
 	type Page,
 	type BrowserContextOptions,
 	type Response,
+	type Browser,
 } from "playwright";
 import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
 import { env } from "$env/dynamic/private";
+import { logger } from "$lib/server/logger";
+const blocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then((blker) => {
+	const mostBlocked = blker.blockFonts().blockMedias().blockFrames().blockImages();
+	if (env.WEBSEARCH_JAVASCRIPT === "false") return mostBlocked.blockScripts();
+	return mostBlocked;
+});
+let browserSingleton: Promise<Browser> | undefined;
+/**
+ * Launches a headless Chromium browser and wires up its lifecycle handlers.
+ *
+ * While the browser is alive, a SIGINT closes it. When the browser disconnects,
+ * the cached `browserSingleton` is cleared and the SIGINT handler registered for
+ * this browser is removed.
+ *
+ * @returns The launched browser.
+ */
+async function getBrowser(): Promise<Browser> {
 	const browser = await chromium.launch({ headless: true });
+	// Keep a reference to the handler so it can be detached once this browser is gone
+	const closeOnSigint = () => browser.close();
+	process.on("SIGINT", closeOnSigint);
+	browser.on("disconnected", () => {
+		logger.warn("Browser closed");
+		// Without this, every relaunch would add another SIGINT listener that
+		// still references an already-dead browser
+		process.off("SIGINT", closeOnSigint);
+		browserSingleton = undefined;
+	});
+	return browser;
+}
+async function getPlaywrightCtx() {
+	if (!browserSingleton) browserSingleton = getBrowser();
+	const browser = await browserSingleton;
 	const device = devices["Desktop Chrome"];
 	const options: BrowserContextOptions = {
 		timezoneId: "America/New_York",
 		locale: "en-US",
 	};
+	return browser.newContext(options);
 }
+/**
+ * Opens `url` in a new page of a new browser context, passes the page to
+ * `callback`, and closes the context once the callback has settled.
+ *
+ * @param url - Address to navigate to.
+ * @param callback - Receives the page and the navigation response. The response
+ *   is `undefined` when navigation rejected (for example on timeout) or yielded
+ *   no response.
+ * @returns The value `callback` resolves to.
+ * @throws Anything thrown while creating the context or page, enabling the
+ *   blocker, or running `callback`. Failures to close the context are logged,
+ *   not thrown.
+ */
+export async function withPage<T>(
+	url: string,
+	callback: (page: Page, response?: Response) => Promise<T>
+): Promise<T> {
+	// Single source for the navigation budget so the log message cannot drift from it
+	const navigationTimeoutMs = 3500;
+	const ctx = await getPlaywrightCtx();
+	try {
+		const page = await ctx.newPage();
+		await blocker.enableBlockingInPage(page);
+		const res = await page
+			.goto(url, { waitUntil: "load", timeout: navigationTimeoutMs })
+			.catch(() => {
+				logger.warn(`Failed to load page within ${navigationTimeoutMs}ms: ${url}`);
+			});
+		// await needed here so that we don't close the context before the callback is done
+		return await callback(page, res ?? undefined);
+	} finally {
+		// Await and catch the close: a floating rejected promise would surface as an
+		// unhandled rejection, and a close failure must not mask the callback's outcome
+		await ctx.close().catch((err: unknown) => {
+			logger.warn(`Failed to close browser context: ${String(err)}`);
+		});
+	}
 }

src/lib/server/websearch/scrape/scrape.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
 import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";
-import { loadPage } from "./playwright";
 import { spatialParser } from "./parser";
 import { htmlToMarkdownTree } from "../markdown/tree";
@@ -30,9 +30,7 @@ export const scrape = (maxCharsPerElem: number) =>
 	};
 export async function scrapeUrl(url: string, maxCharsPerElem: number) {
-	const { res, page } = await loadPage(url);
-	try {
 		if (!res) throw Error("Failed to load page");
 		// Check if it's a non-html content type that we can handle directly
@@ -66,7 +64,5 @@ export async function scrapeUrl(url: string, maxCharsPerElem: number) {
 				throw Error("Parsing failed", { cause });
 			});
 		return scrapedOutput;
-	} finally {
-		page.close();
-	}
 }

 import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
 import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";
+import { withPage } from "./playwright";
 import { spatialParser } from "./parser";
 import { htmlToMarkdownTree } from "../markdown/tree";
 	};
 export async function scrapeUrl(url: string, maxCharsPerElem: number) {
+	return withPage(url, async (page, res) => {
 		if (!res) throw Error("Failed to load page");
 		// Check if it's a non-html content type that we can handle directly
 				throw Error("Parsing failed", { cause });
 			});
 		return scrapedOutput;
+	});
 }