|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
from typing import Any, Optional |
|
from warnings import warn |
|
|
|
from camel.types.enums import JinaReturnFormat |
|
|
|
# Base URL of the Jina Reader service. The target page URL is appended
# directly after it, so the trailing slash is required.
JINA_ENDPOINT = "https://r.jina.ai/"
|
|
|
|
|
class JinaURLReader:
    r"""URL Reader provided by Jina AI. The output is cleaner and more
    LLM-friendly than the URL Reader of UnstructuredIO. Can be configured to
    replace the UnstructuredIO URL Reader in the pipeline.

    Args:
        api_key (Optional[str], optional): The API key for Jina AI. If not
            provided, the `JINA_API_KEY` environment variable is used. If
            neither is set, the reader will have a lower rate limit.
            Defaults to None.
        return_format (JinaReturnFormat, optional): The level of detail
            of the returned content, which is optimized for LLMs. For
            now screenshots are not supported. Defaults to
            JinaReturnFormat.DEFAULT.
        json_response (bool, optional): Whether to return the response
            in JSON format. Defaults to False.
        timeout (int, optional): The maximum time in seconds to wait for
            the page to be rendered. Defaults to 30.
        **kwargs (Any): Additional keyword arguments, including proxies,
            cookies, etc. It should align with the HTTP Header field and
            value pairs listed in the reference.

    References:
        https://jina.ai/reader
    """

    # Extra seconds the HTTP client waits on top of the rendering timeout
    # sent in `X-Timeout`, so the client does not give up before the
    # service has had the full rendering time to answer.
    _CLIENT_TIMEOUT_MARGIN = 10

    def __init__(
        self,
        api_key: Optional[str] = None,
        return_format: JinaReturnFormat = JinaReturnFormat.DEFAULT,
        json_response: bool = False,
        timeout: int = 30,
        **kwargs: Any,
    ) -> None:
        # An explicit argument wins over the environment variable.
        api_key = api_key or os.getenv('JINA_API_KEY')
        if not api_key:
            warn(
                "JINA_API_KEY not set. This will result in a low rate limit "
                "of Jina URL Reader. Get API key here: https://jina.ai/reader."
            )

        api_field = f"Bearer {api_key}" if api_key else None
        json_field = "application/json" if json_response else None

        # `kwargs` comes last so callers can override any default header.
        raw_headers = {
            "Authorization": api_field,
            "X-Return-Format": return_format.value,
            "Accept": json_field,
            "X-Timeout": str(timeout),
            **kwargs,
        }

        # Only send headers that actually carry a value (drops None / "").
        self._headers = {k: v for k, v in raw_headers.items() if v}

        # `requests` waits indefinitely unless given a timeout, so bound the
        # HTTP call as well. Unusual values (None, non-numeric, <= 0) keep
        # the previous "no client-side timeout" behaviour.
        self._request_timeout: Optional[float] = None
        if isinstance(timeout, (int, float)) and timeout > 0:
            self._request_timeout = timeout + self._CLIENT_TIMEOUT_MARGIN

    def read_content(self, url: str) -> str:
        r"""Reads the content of a URL and returns it as a string with
        given form.

        Args:
            url (str): The URL to read.

        Returns:
            str: The content of the URL.

        Raises:
            ValueError: If the request fails or the service responds with
                an HTTP error status.
        """
        # Imported lazily so that `requests` is only needed when a page is
        # actually fetched.
        import requests

        full_url = f"{JINA_ENDPOINT}{url}"
        try:
            resp = requests.get(
                full_url,
                headers=self._headers,
                timeout=self._request_timeout,
            )
            resp.raise_for_status()
        except Exception as e:
            # Every failure is surfaced as ValueError, chained to its cause.
            raise ValueError(f"Failed to read content from {url}: {e}") from e

        return resp.text
|
|