File size: 6,534 Bytes
62da328 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import re
from typing import Any, Dict, List, Optional
from camel.toolkits import FunctionTool
from camel.toolkits.base import BaseToolkit
class GoogleScholarToolkit(BaseToolkit):
    r"""A toolkit for retrieving information about authors and their
    publications from Google Scholar.

    Attributes:
        author_identifier (str): The author's Google Scholar URL or name
            of the author to search for.
        is_author_name (bool): Flag to indicate if the identifier is a name.
            (default: :obj:`False`)
        scholarly (Any): The `scholarly` object imported from the
            `scholarly` package, used for querying Google Scholar.
        author (Optional[Dict[str, Any]]): Cached author details, allowing
            manual assignment if desired.
    """

    def __init__(
        self, author_identifier: str, is_author_name: bool = False
    ) -> None:
        r"""Initializes the GoogleScholarToolkit with the author's identifier.

        Args:
            author_identifier (str): The author's Google Scholar URL or name
                of the author to search for.
            is_author_name (bool): Flag to indicate if the identifier is a
                name. (default: :obj:`False`)
        """
        # Imported here rather than at module level, so the import only
        # runs when a toolkit instance is created.
        from scholarly import scholarly

        self.scholarly = scholarly
        self.author_identifier = author_identifier
        self.is_author_name = is_author_name
        # Filled by `get_author_detailed_info` or by the `author` setter.
        self._author: Optional[Dict[str, Any]] = None

    @property
    def author(self) -> Dict[str, Any]:
        r"""Getter for the author attribute, fetching details if not cached.

        Returns:
            Dict[str, Any]: A dictionary containing author details. If the
                cached value is still empty after fetching, returns an
                empty dictionary.

        Raises:
            ValueError: Propagated from `get_author_detailed_info` when the
                author cannot be resolved.
        """
        if self._author is None:
            self.get_author_detailed_info()
        return self._author or {}

    @author.setter
    def author(self, value: Optional[Dict[str, Any]]) -> None:
        r"""Sets or overrides the cached author information.

        Args:
            value (Optional[Dict[str, Any]]): A dictionary containing author
                details to cache or `None` to clear the cached data.

        Raises:
            ValueError: If `value` is not a dictionary or `None`.
        """
        if value is None or isinstance(value, dict):
            self._author = value
        else:
            raise ValueError("Author must be a dictionary or None.")

    def _extract_author_id(self) -> Optional[str]:
        r"""Extracts the author ID from a Google Scholar URL if provided.

        Returns:
            Optional[str]: The extracted author ID, or None if not found.
        """
        # The character class includes `_` as well as `-`: Scholar IDs can
        # contain underscores, and without it such an ID would be cut off
        # at the first underscore.
        match = re.search(r'user=([A-Za-z0-9_-]+)', self.author_identifier)
        return match.group(1) if match else None

    def get_author_detailed_info(
        self,
    ) -> dict:
        r"""Retrieves detailed information about the author.

        The author is looked up by name when `is_author_name` is set, and
        otherwise by the ID extracted from the profile URL. The filled
        result is cached on the instance.

        Returns:
            dict: A dictionary containing detailed information about the
                author.

        Raises:
            ValueError: If a name search yields no result, or if no author
                ID can be extracted from `author_identifier`.
        """
        if self.is_author_name:
            search_query = self.scholarly.search_author(self.author_identifier)
            # Take the first result; a default avoids leaking a bare
            # StopIteration when the search comes back empty.
            first_author_result = next(search_query, None)
            if first_author_result is None:
                raise ValueError(
                    "No Google Scholar author found for name "
                    f"'{self.author_identifier}'."
                )
        else:
            author_id = self._extract_author_id()
            if author_id is None:
                # Fail before querying instead of looking up `id=None`.
                raise ValueError(
                    "Could not extract an author ID from "
                    f"'{self.author_identifier}'. Provide a Google Scholar "
                    "profile URL containing 'user=<id>', or set "
                    "is_author_name=True to search by name."
                )
            first_author_result = self.scholarly.search_author_id(id=author_id)
        self._author = self.scholarly.fill(first_author_result)
        return self._author  # type: ignore[return-value]

    def get_author_publications(
        self,
    ) -> List[str]:
        r"""Retrieves the titles of the author's publications.

        Returns:
            List[str]: A list of publication titles authored by the author.
                Empty if the author data has no publications.
        """
        # `.get` tolerates an author dictionary without a 'publications'
        # key, e.g. the empty fallback of the `author` property.
        return [
            pub['bib']['title']
            for pub in self.author.get('publications', [])
        ]

    def get_publication_by_title(
        self, publication_title: str
    ) -> Optional[dict]:
        r"""Retrieves detailed information about a specific publication by its
        title. Note that this method cannot retrieve the full content of the
        paper.

        Args:
            publication_title (str): The title of the publication to search
                for. The comparison is an exact string match.

        Returns:
            Optional[dict]: A dictionary containing detailed information about
                the publication if found; otherwise, `None`.
        """
        # `.get` tolerates an author dictionary without a 'publications' key.
        for publication in self.author.get('publications', []):
            if publication['bib']['title'] == publication_title:
                return self.scholarly.fill(publication)
        return None  # Return None if not found

    def get_full_paper_content_by_link(self, pdf_url: str) -> Optional[str]:
        r"""Retrieves the full paper content from a given PDF URL using the
        arxiv2text tool.

        Args:
            pdf_url (str): The URL of the PDF file.

        Returns:
            Optional[str]: The full text extracted from the PDF, or `None` if
                an error occurs.
        """
        # Kept outside the `try` so a missing `arxiv2text` package raises
        # ImportError instead of being reported as `None`.
        from arxiv2text import arxiv_to_text

        try:
            return arxiv_to_text(pdf_url)
        except Exception:
            # Deliberate best-effort: any extraction failure maps to None.
            return None

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.get_author_detailed_info),
            FunctionTool(self.get_author_publications),
            FunctionTool(self.get_publication_by_title),
            FunctionTool(self.get_full_paper_content_by_link),
        ]
|