import os
from typing import Any, Dict, Optional, Type

from pydantic import BaseModel


class Firecrawl:
    r"""Firecrawl allows you to turn entire websites into LLM-ready markdown.

    Args:
        api_key (Optional[str]): API key for authenticating with the
            Firecrawl API. Defaults to the `FIRECRAWL_API_KEY` environment
            variable.
        api_url (Optional[str]): Base URL for the Firecrawl API. Defaults
            to the `FIRECRAWL_API_URL` environment variable.

    References:
        https://docs.firecrawl.dev/introduction
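
    Example:
        Illustrative usage, not a runnable doctest; it assumes the
        `FIRECRAWL_API_KEY` environment variable is set (or an `api_key`
        is passed) and uses `https://example.com` as a placeholder URL:

        >>> firecrawl = Firecrawl()
        >>> data = firecrawl.scrape("https://example.com")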
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
    ) -> None:
        from firecrawl import FirecrawlApp

        self._api_key = api_key or os.environ.get("FIRECRAWL_API_KEY")
        self._api_url = api_url or os.environ.get("FIRECRAWL_API_URL")

        self.app = FirecrawlApp(api_key=self._api_key, api_url=self._api_url)

    def crawl(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Any:
        r"""Crawl a URL and all accessible subpages. The crawl can be
        customized through `params`, and the call returns either the full
        crawl results or a job ID, depending on the options given.

        Args:
            url (str): The URL to crawl.
            params (Optional[Dict[str, Any]]): Additional parameters for the
                crawl request. Defaults to `None`.
            **kwargs (Any): Additional keyword arguments, such as
                `poll_interval` and `idempotency_key`.

        Returns:
            Any: The crawl job ID, or the crawl results if waiting until
                completion.

        Raises:
            RuntimeError: If the crawling process fails.
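
        Example:
            A minimal sketch, not a doctest; it assumes credentials are
            configured and treats `https://example.com` as a placeholder:

            >>> firecrawl = Firecrawl()
            >>> response = firecrawl.crawl("https://example.com")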
        """
        try:
            crawl_response = self.app.crawl_url(
                url=url,
                params=params,
                **kwargs,
            )
            return crawl_response
        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise RuntimeError(f"Failed to crawl the URL: {e}") from e

    def markdown_crawl(self, url: str) -> str:
        r"""Crawl a URL and all accessible subpages and return the content
        in Markdown format.

        Args:
            url (str): The URL to crawl.

        Returns:
            str: The content of the URL in Markdown format.

        Raises:
            RuntimeError: If the crawling process fails.
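
        Example:
            Illustrative only, not a doctest; assumes credentials are
            configured and the placeholder URL is replaced with a real
            site:

            >>> firecrawl = Firecrawl()
            >>> markdown = firecrawl.markdown_crawl("https://example.com")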
        """
        try:
            crawl_result = self.app.crawl_url(
                url,
                {'formats': ['markdown']},
            )
            # A completed crawl is expected to yield a list of page dicts,
            # each carrying a 'markdown' field; guard against other shapes.
            if not isinstance(crawl_result, list):
                raise ValueError("Unexpected response format")
            markdown_contents = [
                result.get('markdown', '') for result in crawl_result
            ]
            return '\n'.join(markdown_contents)
        except Exception as e:
            raise RuntimeError(
                f"Failed to crawl the URL and retrieve markdown: {e}"
            ) from e

    def check_crawl_job(self, job_id: str) -> Dict:
        r"""Check the status of a crawl job.

        Args:
            job_id (str): The ID of the crawl job.

        Returns:
            Dict: The response, including the status of the crawl job.

        Raises:
            RuntimeError: If the check process fails.
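
        Example:
            Illustrative only; `"<job-id>"` stands in for an ID returned
            by a previous `crawl` call:

            >>> firecrawl = Firecrawl()
            >>> status = firecrawl.check_crawl_job("<job-id>")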
        """
        try:
            return self.app.check_crawl_status(job_id)
        except Exception as e:
            raise RuntimeError(
                f"Failed to check the crawl job status: {e}"
            ) from e

    def scrape(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
    ) -> Dict:
        r"""Scrape a single URL. This function supports advanced scraping
        by setting different parameters and returns the full scraped data
        as a dictionary.

        Reference: https://docs.firecrawl.dev/advanced-scraping-guide

        Args:
            url (str): The URL to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the
                scrape request. Defaults to `None`.

        Returns:
            Dict: The scraped data.

        Raises:
            RuntimeError: If the scrape process fails.
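
        Example:
            A sketch, not a doctest; the `formats` key mirrors the one
            used elsewhere in this class, and the URL is a placeholder:

            >>> firecrawl = Firecrawl()
            >>> data = firecrawl.scrape(
            ...     "https://example.com", params={'formats': ['markdown']}
            ... )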
        """
        try:
            return self.app.scrape_url(url=url, params=params)
        except Exception as e:
            raise RuntimeError(f"Failed to scrape the URL: {e}") from e

    def structured_scrape(
        self, url: str, response_format: Type[BaseModel]
    ) -> Dict:
        r"""Use an LLM to extract structured data from the given URL.

        Args:
            url (str): The URL to scrape.
            response_format (Type[BaseModel]): A Pydantic model class whose
                field types and descriptions define the schema of the
                structured response generated by the LLM.

        Returns:
            Dict: The extracted structured data.

        Raises:
            RuntimeError: If the scrape process fails.
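
        Example:
            A minimal sketch; `ArticleSchema` is a hypothetical model
            defined here purely for illustration:

            >>> class ArticleSchema(BaseModel):
            ...     title: str
            ...     summary: str
            >>> firecrawl = Firecrawl()
            >>> extracted = firecrawl.structured_scrape(
            ...     "https://example.com", ArticleSchema
            ... )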
        """
        try:
            data = self.app.scrape_url(
                url,
                {
                    'formats': ['extract'],
                    'extract': {
                        'schema': response_format.model_json_schema()
                    },
                },
            )
            # The extracted fields live under the 'extract' key of the
            # scrape response; fall back to an empty dict if absent.
            return data.get("extract", {})
        except Exception as e:
            raise RuntimeError(
                f"Failed to perform structured scrape: {e}"
            ) from e

    def map_site(
        self, url: str, params: Optional[Dict[str, Any]] = None
    ) -> list:
        r"""Map a website to retrieve all accessible URLs.

        Args:
            url (str): The URL of the site to map.
            params (Optional[Dict[str, Any]]): Additional parameters for the
                map request. Defaults to `None`.

        Returns:
            list: A list containing the URLs found on the site.

        Raises:
            RuntimeError: If the mapping process fails.
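
        Example:
            Illustrative only; assumes credentials are configured and the
            URL is a placeholder:

            >>> firecrawl = Firecrawl()
            >>> urls = firecrawl.map_site("https://example.com")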
        """
        try:
            return self.app.map_url(url=url, params=params)
        except Exception as e:
            raise RuntimeError(f"Failed to map the site: {e}") from e