import os
from typing import Any, Dict, Optional, Type

from pydantic import BaseModel


class Firecrawl:
    r"""Firecrawl allows you to turn entire websites into LLM-ready markdown.

    Args:
        api_key (Optional[str]): API key for authenticating with the
            Firecrawl API. Defaults to the `FIRECRAWL_API_KEY` environment
            variable.
        api_url (Optional[str]): Base URL for the Firecrawl API. Defaults
            to the `FIRECRAWL_API_URL` environment variable.

    References:
        https://docs.firecrawl.dev/introduction
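
    Example:
        Illustrative usage, not a runnable doctest; it assumes the
        `FIRECRAWL_API_KEY` environment variable is set (or an `api_key`
        is passed) and uses `https://example.com` as a placeholder URL:

        >>> firecrawl = Firecrawl()
        >>> data = firecrawl.scrape("https://example.com")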
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
    ) -> None:
        from firecrawl import FirecrawlApp

        self._api_key = api_key or os.environ.get("FIRECRAWL_API_KEY")
        self._api_url = api_url or os.environ.get("FIRECRAWL_API_URL")

        self.app = FirecrawlApp(api_key=self._api_key, api_url=self._api_url)

    def crawl(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Any:
        r"""Crawl a URL and all accessible subpages. The crawl can be
        customized through `params`, and the call returns either the full
        crawl results or a job ID, depending on the options given.

        Args:
            url (str): The URL to crawl.
            params (Optional[Dict[str, Any]]): Additional parameters for the
                crawl request. Defaults to `None`.
            **kwargs (Any): Additional keyword arguments, such as
                `poll_interval` and `idempotency_key`.

        Returns:
            Any: The crawl job ID, or the crawl results if waiting until
                completion.

        Raises:
            RuntimeError: If the crawling process fails.
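
        Example:
            A minimal sketch, not a doctest; it assumes credentials are
            configured and treats `https://example.com` as a placeholder:

            >>> firecrawl = Firecrawl()
            >>> response = firecrawl.crawl("https://example.com")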
        """
        try:
            crawl_response = self.app.crawl_url(
                url=url,
                params=params,
                **kwargs,
            )
            return crawl_response
        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise RuntimeError(f"Failed to crawl the URL: {e}") from e

    def markdown_crawl(self, url: str) -> str:
        r"""Crawl a URL and all accessible subpages and return the content
        in Markdown format.

        Args:
            url (str): The URL to crawl.

        Returns:
            str: The content of the URL in Markdown format.

        Raises:
            RuntimeError: If the crawling process fails.
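
        Example:
            Illustrative only, not a doctest; assumes credentials are
            configured and the placeholder URL is replaced with a real
            site:

            >>> firecrawl = Firecrawl()
            >>> markdown = firecrawl.markdown_crawl("https://example.com")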
        """
        try:
            crawl_result = self.app.crawl_url(
                url,
                {'formats': ['markdown']},
            )
            # A completed crawl is expected to yield a list of page dicts,
            # each carrying a 'markdown' field; guard against other shapes.
            if not isinstance(crawl_result, list):
                raise ValueError("Unexpected response format")
            markdown_contents = [
                result.get('markdown', '') for result in crawl_result
            ]
            return '\n'.join(markdown_contents)
        except Exception as e:
            raise RuntimeError(
                f"Failed to crawl the URL and retrieve markdown: {e}"
            ) from e

    def check_crawl_job(self, job_id: str) -> Dict:
        r"""Check the status of a crawl job.

        Args:
            job_id (str): The ID of the crawl job.

        Returns:
            Dict: The response, including the status of the crawl job.

        Raises:
            RuntimeError: If the check process fails.
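
        Example:
            Illustrative only; `"<job-id>"` stands in for an ID returned
            by a previous `crawl` call:

            >>> firecrawl = Firecrawl()
            >>> status = firecrawl.check_crawl_job("<job-id>")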
        """
        try:
            return self.app.check_crawl_status(job_id)
        except Exception as e:
            raise RuntimeError(
                f"Failed to check the crawl job status: {e}"
            ) from e

    def scrape(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
    ) -> Dict:
        r"""Scrape a single URL. This function supports advanced scraping
        by setting different parameters and returns the full scraped data
        as a dictionary.

        Reference: https://docs.firecrawl.dev/advanced-scraping-guide

        Args:
            url (str): The URL to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the
                scrape request. Defaults to `None`.

        Returns:
            Dict: The scraped data.

        Raises:
            RuntimeError: If the scrape process fails.
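
        Example:
            A sketch, not a doctest; the `formats` key mirrors the one
            used elsewhere in this class, and the URL is a placeholder:

            >>> firecrawl = Firecrawl()
            >>> data = firecrawl.scrape(
            ...     "https://example.com", params={'formats': ['markdown']}
            ... )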
        """
        try:
            return self.app.scrape_url(url=url, params=params)
        except Exception as e:
            raise RuntimeError(f"Failed to scrape the URL: {e}") from e

    def structured_scrape(
        self, url: str, response_format: Type[BaseModel]
    ) -> Dict:
        r"""Use an LLM to extract structured data from the given URL.

        Args:
            url (str): The URL to scrape.
            response_format (Type[BaseModel]): A Pydantic model class whose
                field types and descriptions define the schema of the
                structured response generated by the LLM.

        Returns:
            Dict: The extracted structured data.

        Raises:
            RuntimeError: If the scrape process fails.
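
        Example:
            A minimal sketch; `ArticleSchema` is a hypothetical model
            defined here purely for illustration:

            >>> class ArticleSchema(BaseModel):
            ...     title: str
            ...     summary: str
            >>> firecrawl = Firecrawl()
            >>> extracted = firecrawl.structured_scrape(
            ...     "https://example.com", ArticleSchema
            ... )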
        """
        try:
            data = self.app.scrape_url(
                url,
                {
                    'formats': ['extract'],
                    'extract': {
                        'schema': response_format.model_json_schema()
                    },
                },
            )
            # The extracted fields live under the 'extract' key of the
            # scrape response; fall back to an empty dict if absent.
            return data.get("extract", {})
        except Exception as e:
            raise RuntimeError(
                f"Failed to perform structured scrape: {e}"
            ) from e

    def map_site(
        self, url: str, params: Optional[Dict[str, Any]] = None
    ) -> list:
        r"""Map a website to retrieve all accessible URLs.

        Args:
            url (str): The URL of the site to map.
            params (Optional[Dict[str, Any]]): Additional parameters for the
                map request. Defaults to `None`.

        Returns:
            list: A list containing the URLs found on the site.

        Raises:
            RuntimeError: If the mapping process fails.
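
        Example:
            Illustrative only; assumes credentials are configured and the
            URL is a placeholder:

            >>> firecrawl = Firecrawl()
            >>> urls = firecrawl.map_site("https://example.com")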
        """
        try:
            return self.app.map_url(url=url, params=params)
        except Exception as e:
            raise RuntimeError(f"Failed to map the site: {e}") from e