from concurrent.futures.thread import ThreadPoolExecutor
from functools import partial
from colorama import Fore, init
import requests
import subprocess
import sys
import importlib.util
from . import (
ArxivScraper,
BeautifulSoupScraper,
PyMuPDFScraper,
WebBaseLoaderScraper,
BrowserScraper,
TavilyExtract
)


class Scraper:
"""
Scraper class to extract the content from the links
"""

    def __init__(self, urls, user_agent, scraper):
        """
        Initialize the Scraper class.

        Args:
            urls: List of URLs to extract content from.
            user_agent: User-Agent header value used for all HTTP requests.
            scraper: Key of the default scraper backend (e.g. "bs", "browser", "tavily_extract").
        """
self.urls = urls
self.session = requests.Session()
self.session.headers.update({"User-Agent": user_agent})
self.scraper = scraper
if self.scraper == "tavily_extract":
self._check_pkg(self.scraper)

    def run(self):
        """
        Extracts the content from the links concurrently and returns only the
        results that produced raw content.
        """
partial_extract = partial(self.extract_data_from_url, session=self.session)
with ThreadPoolExecutor(max_workers=20) as executor:
contents = executor.map(partial_extract, self.urls)
res = [content for content in contents if content["raw_content"] is not None]
return res

    def _check_pkg(self, scraper_name: str) -> None:
        """
        Checks and ensures that the required Python packages are available for scrapers
        that need dependencies beyond requirements.txt. When adding a new scraper to the
        repo, update `pkg_map` with its required information and call `_check_pkg()`
        during initialization.
        """
pkg_map = {
"tavily_extract": {"package_installation_name": "tavily-python",
"import_name": "tavily"},
}
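        # Illustrative sketch of extending pkg_map for a new scraper with an optional
        # dependency (the "firecrawl" names below are hypothetical, not part of this repo):
        # "firecrawl": {"package_installation_name": "firecrawl-py",
        #               "import_name": "firecrawl"},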
        pkg = pkg_map[scraper_name]
if not importlib.util.find_spec(pkg["import_name"]):
pkg_inst_name = pkg["package_installation_name"]
init(autoreset=True)
print(Fore.YELLOW + f"{pkg_inst_name} not found. Attempting to install...")
try:
subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_inst_name])
print(Fore.GREEN + f"{pkg_inst_name} installed successfully.")
except subprocess.CalledProcessError:
raise ImportError(
Fore.RED + f"Unable to install {pkg_inst_name}. Please install manually with "
f"`pip install -U {pkg_inst_name}`"
)

    def extract_data_from_url(self, link, session):
        """
        Extracts the content from a single link and returns a result dict
        with the url, raw_content, image_urls and title.
        """
        try:
            scraper_class = self.get_scraper(link)
            scraper = scraper_class(link, session)
            content, image_urls, title = scraper.scrape()
            # Treat very short pages as failed scrapes so they are filtered out in run().
            if len(content) < 100:
                return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
            return {"url": link, "raw_content": content, "image_urls": image_urls, "title": title}
        except Exception:
            return {"url": link, "raw_content": None, "image_urls": [], "title": ""}

    def get_scraper(self, link):
        """
        Determines the appropriate scraper class for the provided link, falling back
        to the default scraper if no special case matches.

        Args:
            link: URL of a web page or PDF file.

        Returns:
            The scraper class to use, looked up in the `SCRAPER_CLASSES` dictionary.
            Links ending in ".pdf" map to `PyMuPDFScraper`, links containing
            "arxiv.org" map to `ArxivScraper`, and all other links use the scraper
            configured at initialization.
        """
SCRAPER_CLASSES = {
"pdf": PyMuPDFScraper,
"arxiv": ArxivScraper,
"bs": BeautifulSoupScraper,
"web_base_loader": WebBaseLoaderScraper,
"browser": BrowserScraper,
"tavily_extract": TavilyExtract
}
        if link.endswith(".pdf"):
            scraper_key = "pdf"
        elif "arxiv.org" in link:
            scraper_key = "arxiv"
        else:
            scraper_key = self.scraper
scraper_class = SCRAPER_CLASSES.get(scraper_key)
if scraper_class is None:
            raise Exception(f"Scraper '{scraper_key}' not found.")
return scraper_class
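

if __name__ == "__main__":
    # Minimal usage sketch, added for illustration only. It assumes this file lives inside
    # its scrapers package (so the relative imports above resolve) and is run as a module,
    # e.g. `python -m <package>.scraper`. The URLs, User-Agent string, and the "bs"
    # backend below are example values, not part of the original file.
    example_urls = ["https://example.com", "https://arxiv.org/abs/1706.03762"]
    example_scraper = Scraper(
        example_urls,
        user_agent="Mozilla/5.0 (compatible; ExampleBot/1.0)",
        scraper="bs",
    )
    for result in example_scraper.run():
        print(result["url"], "->", result["title"])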