import importlib.util
import subprocess
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial

import requests
from colorama import Fore, init

from . import (
    ArxivScraper,
    BeautifulSoupScraper,
    PyMuPDFScraper,
    WebBaseLoaderScraper,
    BrowserScraper,
    TavilyExtract
)


class Scraper:
    """
    Scraper class that extracts content from a list of links.
    """

    def __init__(self, urls, user_agent, scraper):
        """
        Initialize the Scraper class.

        Args:
            urls: List of URLs to scrape.
            user_agent: User-Agent string to send with every request.
            scraper: Name of the scraper backend to use (a key of SCRAPER_CLASSES,
                e.g. "bs", "browser", or "tavily_extract").
        """
        self.urls = urls
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.scraper = scraper
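        # The Tavily SDK is an extra dependency (beyond requirements.txt), so make
        # sure it is importable, installing it on the fly if needed.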
        if self.scraper == "tavily_extract":
            self._check_pkg(self.scraper)

    def run(self):
        """
        Extracts the content from the links concurrently.

        Returns:
            A list of result dicts ("url", "raw_content", "image_urls", "title") for
            every link that produced usable content.
        """
        partial_extract = partial(self.extract_data_from_url, session=self.session)
        with ThreadPoolExecutor(max_workers=20) as executor:
            contents = executor.map(partial_extract, self.urls)
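        # Keep only results whose scrape produced usable content.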
        res = [content for content in contents if content["raw_content"] is not None]
        return res

    def _check_pkg(self, scraper_name: str) -> None:
        """
        Checks that the extra Python package required by a scraper is available,
        installing it if necessary. This covers scrapers whose dependencies are not
        listed in requirements.txt. When adding such a scraper to the repo, add its
        package details to `pkg_map` and call `_check_pkg()` during initialization.
        """
        pkg_map = {
            "tavily_extract": {"package_installation_name": "tavily-python",
                               "import_name": "tavily"},
        }
        pkg = pkg_map[scraper_name]
        if not importlib.util.find_spec(pkg["import_name"]):
            pkg_inst_name = pkg["package_installation_name"]
            init(autoreset=True)
            print(Fore.YELLOW + f"{pkg_inst_name} not found. Attempting to install...")
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_inst_name])
                print(Fore.GREEN + f"{pkg_inst_name} installed successfully.")
            except subprocess.CalledProcessError:
                raise ImportError(
                    Fore.RED + f"Unable to install {pkg_inst_name}. Please install manually with "
                               f"`pip install -U {pkg_inst_name}`"
                )

    def extract_data_from_url(self, link, session):
        """
        Extracts the data from a single link.

        Returns:
            A dict with "url", "raw_content", "image_urls", and "title"; "raw_content"
            is None when scraping fails or the extracted text is too short.
        """
        try:
            scraper_class = self.get_scraper(link)
            scraper = scraper_class(link, session)
            content, image_urls, title = scraper.scrape()

            # Discard pages whose extracted text is too short to be useful.
            if len(content) < 100:
                return {"url": link, "raw_content": None, "image_urls": [], "title": ""}

            return {"url": link, "raw_content": content, "image_urls": image_urls, "title": title}
        except Exception:
            return {"url": link, "raw_content": None, "image_urls": [], "title": ""}

    def get_scraper(self, link):
        """
        Determines the appropriate scraper class for the provided link, falling back
        to the configured default scraper when no special case matches.

        Args:
            link: URL of a webpage or PDF file.

        Returns:
            The scraper class to use for the link, chosen from the `SCRAPER_CLASSES`
            mapping: links ending in ".pdf" use `PyMuPDFScraper`, links containing
            "arxiv.org" use `ArxivScraper`, and all other links use the scraper
            configured at initialization.
        """

        SCRAPER_CLASSES = {
            "pdf": PyMuPDFScraper,
            "arxiv": ArxivScraper,
            "bs": BeautifulSoupScraper,
            "web_base_loader": WebBaseLoaderScraper,
            "browser": BrowserScraper,
            "tavily_extract": TavilyExtract
        }

        if link.endswith(".pdf"):
            scraper_key = "pdf"
        elif "arxiv.org" in link:
            scraper_key = "arxiv"
        else:
            scraper_key = self.scraper

        scraper_class = SCRAPER_CLASSES.get(scraper_key)
        if scraper_class is None:
            raise ValueError(f"Scraper '{scraper_key}' not found.")

        return scraper_class
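

# Example usage (illustrative sketch, not part of the module's API): the URLs and the
# user agent below are placeholders, and "bs" is just one of the backend keys defined
# in SCRAPER_CLASSES above. Because this module uses relative imports, the sketch
# assumes it is called from code inside the package.
#
#     urls = [
#         "https://example.com/article",        # handled by the configured default scraper
#         "https://arxiv.org/abs/1234.56789",   # routed to ArxivScraper
#         "https://example.com/paper.pdf",      # routed to PyMuPDFScraper
#     ]
#     scraper = Scraper(urls, user_agent="Mozilla/5.0 (compatible; example-bot)", scraper="bs")
#     results = scraper.run()  # list of {"url", "raw_content", "image_urls", "title"} dicts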