File size: 6,534 Bytes
62da328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import re
from typing import Any, Dict, List, Optional

from camel.toolkits import FunctionTool
from camel.toolkits.base import BaseToolkit


class GoogleScholarToolkit(BaseToolkit):
    r"""A toolkit for retrieving information about authors and their
    publications from Google Scholar.

    Attributes:
        author_identifier (str): The author's Google Scholar URL or name of
            the author to search for.
        is_author_name (bool): Flag to indicate if the identifier is a name.
            (default: :obj:`False`)
        scholarly (module): The scholarly module for querying Google Scholar.
        author (Optional[Dict[str, Any]]): Cached author details, allowing
            manual assignment if desired.
    """

    def __init__(
        self, author_identifier: str, is_author_name: bool = False
    ) -> None:
        r"""Initializes the GoogleScholarToolkit with the author's identifier.

        Args:
            author_identifier (str): The author's Google Scholar URL or name
                of the author to search for.
            is_author_name (bool): Flag to indicate if the identifier is a
                name. (default: :obj:`False`)
        """
        # Imported lazily so the optional `scholarly` dependency is only
        # required when the toolkit is actually instantiated.
        from scholarly import scholarly

        self.scholarly = scholarly
        self.author_identifier = author_identifier
        self.is_author_name = is_author_name
        # Populated on first access of `author` or by an explicit call to
        # `get_author_detailed_info`.
        self._author: Optional[Dict[str, Any]] = None

    @property
    def author(self) -> Dict[str, Any]:
        r"""Getter for the author attribute, fetching details if not cached.

        Returns:
            Dict[str, Any]: A dictionary containing author details. If no data
                is available, returns an empty dictionary.
        """
        if self._author is None:
            self.get_author_detailed_info()
        return self._author or {}

    @author.setter
    def author(self, value: Optional[Dict[str, Any]]) -> None:
        r"""Sets or overrides the cached author information.

        Args:
            value (Optional[Dict[str, Any]]): A dictionary containing author
                details to cache or `None` to clear the cached data.

        Raises:
            ValueError: If `value` is not a dictionary or `None`.
        """
        if value is None or isinstance(value, dict):
            self._author = value
        else:
            raise ValueError("Author must be a dictionary or None.")

    def _extract_author_id(self) -> Optional[str]:
        r"""Extracts the author ID from a Google Scholar URL if provided.

        Returns:
            Optional[str]: The extracted author ID, or None if not found.
        """
        # The character class includes `_` as well as `-`; without it an ID
        # containing an underscore would be cut off at the underscore.
        match = re.search(r'user=([A-Za-z0-9_-]+)', self.author_identifier)
        return match.group(1) if match else None

    def get_author_detailed_info(
        self,
    ) -> dict:
        r"""Retrieves detailed information about the author.

        Returns:
            dict: A dictionary containing detailed information about the
                author.

        Raises:
            ValueError: If no author matches the given name, or if no author
                ID can be extracted from the given URL.
        """
        if self.is_author_name:
            search_query = self.scholarly.search_author(self.author_identifier)
            # Retrieve the first result from the iterator; a default avoids
            # leaking a bare StopIteration when the search has no results.
            first_author_result = next(search_query, None)
            if first_author_result is None:
                raise ValueError(
                    "No Google Scholar author found for name "
                    f"'{self.author_identifier}'."
                )
        else:
            author_id = self._extract_author_id()
            if author_id is None:
                raise ValueError(
                    "Could not extract an author ID from "
                    f"'{self.author_identifier}'. Expected a Google Scholar "
                    "URL containing 'user=<author_id>'."
                )
            first_author_result = self.scholarly.search_author_id(id=author_id)

        # Cache the filled record so later lookups reuse it.
        self._author = self.scholarly.fill(first_author_result)
        return self._author  # type: ignore[return-value]

    def get_author_publications(
        self,
    ) -> List[str]:
        r"""Retrieves the titles of the author's publications.

        Returns:
            List[str]: A list of publication titles authored by the author.
                Empty if the author record has no publications.
        """
        # `.get` keeps this safe when the cached author dict (possibly set
        # manually through the `author` setter) has no 'publications' key.
        publications = self.author.get('publications', [])
        return [pub['bib']['title'] for pub in publications]

    def get_publication_by_title(
        self, publication_title: str
    ) -> Optional[dict]:
        r"""Retrieves detailed information about a specific publication by its
        title. Note that this method cannot retrieve the full content of the
        paper.

        Args:
            publication_title (str): The title of the publication to search
                for.

        Returns:
            Optional[dict]: A dictionary containing detailed information about
                the publication if found; otherwise, `None`.
        """
        # The title comparison is an exact, case-sensitive match.
        for publication in self.author.get('publications', []):
            if publication['bib']['title'] == publication_title:
                return self.scholarly.fill(publication)
        return None  # Return None if not found

    def get_full_paper_content_by_link(self, pdf_url: str) -> Optional[str]:
        r"""Retrieves the full paper content from a given PDF URL using the
        arxiv2text tool.

        Args:
            pdf_url (str): The URL of the PDF file.

        Returns:
            Optional[str]: The full text extracted from the PDF, or `None` if
                an error occurs.
        """
        # Imported lazily; a missing `arxiv2text` package raises ImportError
        # here rather than being swallowed by the handler below.
        from arxiv2text import arxiv_to_text

        try:
            return arxiv_to_text(pdf_url)
        except Exception:
            # Deliberate best-effort: any extraction failure is reported to
            # the caller as `None`, as documented above.
            return None

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.get_author_detailed_info),
            FunctionTool(self.get_author_publications),
            FunctionTool(self.get_publication_by_title),
            FunctionTool(self.get_full_paper_content_by_link),
        ]