File size: 1,540 Bytes
ed4d993
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from typing import Iterator, Optional, Sequence

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class BrowserbaseLoader(BaseLoader):
    """Load pre-rendered web pages using a headless browser hosted on Browserbase.

    Depends on `browserbase` package.
    Get your API key from https://browserbase.com

    Args:
        urls: URLs of the pages to load. One ``Document`` is produced per
            page returned by the Browserbase client.
        text_content: Forwarded unchanged to ``Browserbase.load_urls``.
        api_key: Passed as the first positional argument to the
            ``Browserbase`` client constructor.
        project_id: Passed as the second positional argument to the
            ``Browserbase`` client constructor.
        session_id: Forwarded unchanged to ``Browserbase.load_urls``.
        proxy: Forwarded unchanged to ``Browserbase.load_urls``.

    Raises:
        ImportError: If the ``browserbase`` package is not installed.
    """

    def __init__(
        self,
        urls: Sequence[str],
        text_content: bool = False,
        api_key: Optional[str] = None,
        project_id: Optional[str] = None,
        session_id: Optional[str] = None,
        proxy: Optional[bool] = None,
    ):
        self.urls = urls
        self.text_content = text_content
        self.session_id = session_id
        self.proxy = proxy

        # Imported lazily so that the optional dependency is only required
        # when this loader is actually instantiated.
        try:
            from browserbase import Browserbase
        except ImportError as e:
            # Chain the original error so the traceback shows the real import
            # failure as the direct cause of this friendlier message.
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "browserbase` "
                "to use the Browserbase loader."
            ) from e

        self.browserbase = Browserbase(api_key, project_id)

    def lazy_load(self) -> Iterator[Document]:
        """Load pages from URLs.

        Yields:
            One ``Document`` per loaded page, with the page as
            ``page_content`` and the originating URL under the ``"url"``
            metadata key.
        """
        pages = self.browserbase.load_urls(
            self.urls, self.text_content, self.session_id, self.proxy
        )

        # Pair each page with the URL it came from.
        # NOTE(review): assumes load_urls returns pages in the same order as
        # self.urls, one per URL -- confirm against the browserbase SDK.
        for url, page in zip(self.urls, pages):
            yield Document(
                page_content=page,
                metadata={
                    "url": url,
                },
            )