Liam Dyer nsarrazin HF Staff committed on
Commit
aee936e
·
unverified ·
1 Parent(s): 6e18e46

Fix Playwright leak and use multiple contexts (#1187)

Browse files

* fix: playwright leak and use multiple contexts

* lint

---------

Co-authored-by: Nathan Sarrazin <[email protected]>

src/lib/server/websearch/scrape/playwright.ts CHANGED
@@ -1,23 +1,35 @@
1
  import {
2
- type BrowserContext,
3
  chromium,
4
  devices,
5
  type Page,
6
  type BrowserContextOptions,
7
  type Response,
 
8
  } from "playwright";
9
  import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
10
  import { env } from "$env/dynamic/private";
 
11
 
12
- // Singleton initialized by initPlaywrightService
13
- let playwrightService: Promise<{ ctx: BrowserContext; blocker: PlaywrightBlocker }> | undefined;
14
-
15
- async function initPlaywrightService() {
16
- if (playwrightService) return playwrightService;
17
 
 
 
18
  const browser = await chromium.launch({ headless: true });
19
-
20
  process.on("SIGINT", () => browser.close());
 
 
 
 
 
 
 
 
 
 
21
 
22
  const device = devices["Desktop Chrome"];
23
  const options: BrowserContextOptions = {
@@ -36,31 +48,26 @@ async function initPlaywrightService() {
36
  timezoneId: "America/New_York",
37
  locale: "en-US",
38
  };
39
- const ctx = await browser.newContext(options);
40
- const blocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then((blker) => {
41
- const mostBlocked = blker.blockFonts().blockMedias().blockFrames().blockImages();
42
- if (env.WEBSEARCH_JAVASCRIPT === "false") return mostBlocked.blockScripts();
43
- return mostBlocked;
44
- });
45
-
46
- // Clear the singleton when the context closes
47
- ctx.on("close", () => {
48
- playwrightService = undefined;
49
- });
50
-
51
- return Object.freeze({ ctx, blocker });
52
  }
53
 
54
- export async function loadPage(url: string): Promise<{ res?: Response; page: Page }> {
55
- if (!playwrightService) playwrightService = initPlaywrightService();
56
- const { ctx, blocker } = await playwrightService;
 
 
57
 
58
- const page = await ctx.newPage();
59
- await blocker.enableBlockingInPage(page);
 
60
 
61
- const res = await page.goto(url, { waitUntil: "load", timeout: 3500 }).catch(() => {
62
- console.warn(`Failed to load page within 2s: ${url}`);
63
- });
64
 
65
- return { res: res ?? undefined, page };
 
 
 
 
66
  }
 
1
  import {
 
2
  chromium,
3
  devices,
4
  type Page,
5
  type BrowserContextOptions,
6
  type Response,
7
+ type Browser,
8
  } from "playwright";
9
  import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
10
  import { env } from "$env/dynamic/private";
11
+ import { logger } from "$lib/server/logger";
12
 
13
// Shared ad/tracker blocker, built once at module load from the prebuilt
// ads-and-tracking lists. Fonts, media, frames and images are always blocked;
// scripts are additionally blocked when WEBSEARCH_JAVASCRIPT is "false".
const blocker = await (async () => {
	const prebuilt = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch);
	const mostBlocked = prebuilt.blockFonts().blockMedias().blockFrames().blockImages();
	return env.WEBSEARCH_JAVASCRIPT === "false" ? mostBlocked.blockScripts() : mostBlocked;
})();
18
 
19
// Lazily-created shared browser. Reset to undefined when the browser
// disconnects or fails to launch, so the next caller starts a new one.
let browserSingleton: Promise<Browser> | undefined;

/**
 * Launches a headless Chromium instance and wires up its lifecycle handlers.
 *
 * - On SIGINT the browser is closed.
 * - On "disconnected" the SIGINT handler is removed and the singleton is cleared.
 *   Without removing the handler, every relaunch would add another listener
 *   that keeps a reference to an already-closed browser.
 * - If the launch itself fails, the singleton is cleared before rethrowing so a
 *   rejected promise is not cached forever.
 *
 * @returns the launched browser
 * @throws whatever `chromium.launch` rejects with
 */
async function getBrowser(): Promise<Browser> {
	let browser: Browser;
	try {
		browser = await chromium.launch({ headless: true });
	} catch (err) {
		// The caller has already stored this call's promise in browserSingleton
		// by the time the launch settles; drop it so a later call can retry.
		browserSingleton = undefined;
		throw err;
	}

	const closeOnSigint = () => {
		browser.close().catch((err: unknown) => {
			logger.warn(`Failed to close browser on SIGINT: ${String(err)}`);
		});
	};
	process.on("SIGINT", closeOnSigint);

	browser.on("disconnected", () => {
		process.off("SIGINT", closeOnSigint);
		logger.warn("Browser closed");
		browserSingleton = undefined;
	});

	return browser;
}
29
+
30
+ async function getPlaywrightCtx() {
31
+ if (!browserSingleton) browserSingleton = getBrowser();
32
+ const browser = await browserSingleton;
33
 
34
  const device = devices["Desktop Chrome"];
35
  const options: BrowserContextOptions = {
 
48
  timezoneId: "America/New_York",
49
  locale: "en-US",
50
  };
51
+ return browser.newContext(options);
 
 
 
 
 
 
 
 
 
 
 
 
52
  }
53
 
54
/**
 * Opens `url` in a page of a newly created browser context, runs `callback`
 * against it, and closes the context afterwards whether the callback
 * succeeds or throws.
 *
 * Navigation failures (including the load timeout) are logged, not thrown;
 * in that case `callback` receives `undefined` as the response and decides
 * how to proceed.
 *
 * @param url - address to navigate to
 * @param callback - work to perform with the page and, if navigation succeeded, its response
 * @returns whatever `callback` resolves to
 * @throws anything thrown while creating the page, enabling blocking, or by `callback`
 */
export async function withPage<T>(
	url: string,
	callback: (page: Page, response?: Response) => Promise<T>
): Promise<T> {
	const navigationTimeoutMs = 3500;
	const ctx = await getPlaywrightCtx();

	try {
		const page = await ctx.newPage();
		await blocker.enableBlockingInPage(page);

		const res = await page
			.goto(url, { waitUntil: "load", timeout: navigationTimeoutMs })
			.catch(() => {
				// Message derived from the actual timeout so the two cannot drift apart
				logger.warn(`Failed to load page within ${navigationTimeoutMs}ms: ${url}`);
			});

		// await needed here so that we don't close the context before the callback is done
		return await callback(page, res ?? undefined);
	} finally {
		// Await the close so the context is released before returning, and catch
		// its failure so it neither masks the callback's outcome nor surfaces as
		// an unhandled rejection.
		await ctx.close().catch((err: unknown) => {
			logger.warn(`Failed to close browser context for ${url}: ${String(err)}`);
		});
	}
}
src/lib/server/websearch/scrape/scrape.ts CHANGED
@@ -1,6 +1,6 @@
1
  import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
2
  import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";
3
- import { loadPage } from "./playwright";
4
 
5
  import { spatialParser } from "./parser";
6
  import { htmlToMarkdownTree } from "../markdown/tree";
@@ -30,9 +30,7 @@ export const scrape = (maxCharsPerElem: number) =>
30
  };
31
 
32
  export async function scrapeUrl(url: string, maxCharsPerElem: number) {
33
- const { res, page } = await loadPage(url);
34
-
35
- try {
36
  if (!res) throw Error("Failed to load page");
37
 
38
  // Check if it's a non-html content type that we can handle directly
@@ -66,7 +64,5 @@ export async function scrapeUrl(url: string, maxCharsPerElem: number) {
66
  throw Error("Parsing failed", { cause });
67
  });
68
  return scrapedOutput;
69
- } finally {
70
- page.close();
71
- }
72
  }
 
1
  import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
2
  import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";
3
+ import { withPage } from "./playwright";
4
 
5
  import { spatialParser } from "./parser";
6
  import { htmlToMarkdownTree } from "../markdown/tree";
 
30
  };
31
 
32
  export async function scrapeUrl(url: string, maxCharsPerElem: number) {
33
+ return withPage(url, async (page, res) => {
 
 
34
  if (!res) throw Error("Failed to load page");
35
 
36
  // Check if it's a non-html content type that we can handle directly
 
64
  throw Error("Parsing failed", { cause });
65
  });
66
  return scrapedOutput;
67
+ });
 
 
68
  }