Liam Dyer nsarrazin HF Staff committed on
Commit
853f8fc
·
unverified ·
1 Parent(s): 2a6e267

Always close playwright page (#1171)

Browse files

fix: always close playwright page

Co-authored-by: Nathan Sarrazin <[email protected]>

src/lib/server/websearch/scrape/scrape.ts CHANGED
@@ -24,37 +24,41 @@ export const scrape = (maxCharsPerElem: number) =>
24
  export async function scrapeUrl(url: string, maxCharsPerElem: number) {
25
  const { res, page } = await loadPage(url);
26
 
27
- if (!res) throw Error("Failed to load page");
 
28
 
29
- // Check if it's a non-html content type that we can handle directly
30
- // TODO: direct mappings to markdown can be added for markdown, csv and others
31
- const contentType = res.headers()["content-type"] ?? "";
32
- if (
33
- contentType.includes("text/plain") ||
34
- contentType.includes("text/markdown") ||
35
- contentType.includes("application/json") ||
36
- contentType.includes("application/xml") ||
37
- contentType.includes("text/csv")
38
- ) {
39
- const title = await page.title();
40
- const content = await page.content();
41
- return {
42
- title,
43
- markdownTree: htmlToMarkdownTree(
44
  title,
45
- [{ tagName: "p", attributes: {}, content: [content] }],
46
- maxCharsPerElem
47
- ),
48
- };
49
- }
 
 
50
 
51
- return timeout(page.evaluate(spatialParser), 2000)
52
- .then(({ elements, ...parsed }) => ({
53
- ...parsed,
54
- markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
55
- }))
56
- .catch((cause) => {
57
- throw Error("Parsing failed", { cause });
58
- })
59
- .finally(() => page.close());
 
 
 
60
  }
 
24
  export async function scrapeUrl(url: string, maxCharsPerElem: number) {
25
  const { res, page } = await loadPage(url);
26
 
27
+ try {
28
+ if (!res) throw Error("Failed to load page");
29
 
30
+ // Check if it's a non-html content type that we can handle directly
31
+ // TODO: direct mappings to markdown can be added for markdown, csv and others
32
+ const contentType = res.headers()["content-type"] ?? "";
33
+ if (
34
+ contentType.includes("text/plain") ||
35
+ contentType.includes("text/markdown") ||
36
+ contentType.includes("application/json") ||
37
+ contentType.includes("application/xml") ||
38
+ contentType.includes("text/csv")
39
+ ) {
40
+ const title = await page.title();
41
+ const content = await page.content();
42
+ return {
 
 
43
  title,
44
+ markdownTree: htmlToMarkdownTree(
45
+ title,
46
+ [{ tagName: "p", attributes: {}, content: [content] }],
47
+ maxCharsPerElem
48
+ ),
49
+ };
50
+ }
51
 
52
+ const scrapedOutput = await timeout(page.evaluate(spatialParser), 2000)
53
+ .then(({ elements, ...parsed }) => ({
54
+ ...parsed,
55
+ markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
56
+ }))
57
+ .catch((cause) => {
58
+ throw Error("Parsing failed", { cause });
59
+ });
60
+ return scrapedOutput;
61
+ } finally {
62
+ page.close();
63
+ }
64
  }