File size: 3,601 Bytes
62da328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import os
from typing import Any, Optional
from warnings import warn

from camel.types.enums import JinaReturnFormat

JINA_ENDPOINT = "https://r.jina.ai/"


class JinaURLReader:
    r"""URL Reader provided by Jina AI. The output is cleaner and more
    LLM-friendly than the URL Reader of UnstructuredIO. Can be configured to
    replace the UnstructuredIO URL Reader in the pipeline.

    Args:
        api_key (Optional[str], optional): The API key for Jina AI. If not
            provided, the reader will have a lower rate limit. Defaults to
            None.
        return_format (ReturnFormat, optional): The level of detail
            of the returned content, which is optimized for LLMs. For
            now screenshots are not supported. Defaults to
            ReturnFormat.DEFAULT.
        json_response (bool, optional): Whether to return the response
            in JSON format. Defaults to False.
        timeout (int, optional): The maximum time in seconds to wait for
            the page to be rendered. Defaults to 30.
        **kwargs (Any): Additional keyword arguments, including proxies,
            cookies, etc. It should align with the HTTP Header field and
            value pairs listed in the reference.

    References:
        https://jina.ai/reader
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        return_format: JinaReturnFormat = JinaReturnFormat.DEFAULT,
        json_response: bool = False,
        timeout: int = 30,
        **kwargs: Any,
    ) -> None:
        api_key = api_key or os.getenv('JINA_API_KEY')
        if not api_key:
            warn(
                "JINA_API_KEY not set. This will result in a low rate limit "
                "of Jina URL Reader. Get API key here: https://jina.ai/reader."
            )

        # if the following field not provided, it will be None
        api_field = f"Bearer {api_key}" if api_key else None
        json_field = "application/json" if json_response else None

        raw_headers = {
            "Authorization": api_field,
            "X-Return-Format": return_format.value,
            "Accept": json_field,
            "X-Timeout": str(timeout),
            **kwargs,
        }

        # eliminate None values
        self._headers = {k: v for k, v in raw_headers.items() if v}

    def read_content(self, url: str) -> str:
        r"""Reads the content of a URL and returns it as a string with
        given form.

        Args:
            url (str): The URL to read.

        Returns:
            str: The content of the URL.
        """

        import requests

        full_url = f"{JINA_ENDPOINT}{url}"
        try:
            resp = requests.get(full_url, headers=self._headers)
            resp.raise_for_status()
        except Exception as e:
            raise ValueError(f"Failed to read content from {url}: {e}") from e

        return resp.text