File size: 5,171 Bytes
ed4d993
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""Util that calls Google Scholar Search."""
from typing import Dict, Optional

from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator
from langchain_core.utils import get_from_dict_or_env


class GoogleScholarAPIWrapper(BaseModel):
    """Wrapper for Google Scholar API

    You can create serpapi key by signing up at: https://serpapi.com/users/sign_up.

    The wrapper uses the serpapi python package:
    https://serpapi.com/integrations/python#search-google-scholar

    To use, you should have the environment variable ``SERP_API_KEY``
    set with your API key, or pass `serp_api_key` as a named parameter
    to the constructor.

    Attributes:
        top_k_results: number of results to return from google-scholar query search.
            By default it returns top 10 results.
        hl: attribute defines the language to use for the Google Scholar search.
            It's a two-letter language code.
            (e.g., en for English, es for Spanish, or fr for French). Head to the
            Google languages page for a full list of supported Google languages:
            https://serpapi.com/google-languages

        lr: attribute defines one or multiple languages to limit the search to.
            It uses lang_{two-letter language code} to specify languages
            and | as a delimiter. (e.g., lang_fr|lang_de will only search French
            and German pages). Head to the Google lr languages for a full
            list of supported languages: https://serpapi.com/google-lr-languages

     Example:
        .. code-block:: python

        from langchain_community.utilities import GoogleScholarAPIWrapper
        google_scholar = GoogleScholarAPIWrapper()
        google_scholar.run('langchain')
    """

    top_k_results: int = 10
    hl: str = "en"
    lr: str = "lang_en"
    serp_api_key: Optional[str] = None

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment."""
        serp_api_key = get_from_dict_or_env(values, "serp_api_key", "SERP_API_KEY")
        values["SERP_API_KEY"] = serp_api_key

        try:
            from serpapi import GoogleScholarSearch

        except ImportError:
            raise ImportError(
                "google-search-results is not installed. "
                "Please install it with `pip install google-search-results"
                ">=2.4.2`"
            )
        GoogleScholarSearch.SERP_API_KEY = serp_api_key
        values["google_scholar_engine"] = GoogleScholarSearch

        return values

    def run(self, query: str) -> str:
        """Run query through GoogleSearchScholar and parse result"""
        total_results = []
        page = 0
        while page < max((self.top_k_results - 20), 1):
            # We are getting 20 results from every page
            # which is the max in order to reduce the number of API CALLS.
            # 0 is the first page of results, 20 is the 2nd page of results,
            # 40 is the 3rd page of results, etc.
            results = (
                self.google_scholar_engine(  # type: ignore
                    {
                        "q": query,
                        "start": page,
                        "hl": self.hl,
                        "num": min(
                            self.top_k_results, 20
                        ),  # if top_k_result is less than 20.
                        "lr": self.lr,
                    }
                )
                .get_dict()
                .get("organic_results", [])
            )
            total_results.extend(results)
            if not results:  # No need to search for more pages if current page
                # has returned no results
                break
            page += 20
        if (
            self.top_k_results % 20 != 0 and page > 20 and total_results
        ):  # From the last page we would only need top_k_results%20 results
            # if k is not divisible by 20.
            results = (
                self.google_scholar_engine(  # type: ignore
                    {
                        "q": query,
                        "start": page,
                        "num": self.top_k_results % 20,
                        "hl": self.hl,
                        "lr": self.lr,
                    }
                )
                .get_dict()
                .get("organic_results", [])
            )
            total_results.extend(results)
        if not total_results:
            return "No good Google Scholar Result was found"
        docs = [
            f"Title: {result.get('title','')}\n"
            f"Authors: {','.join([author.get('name') for author in result.get('publication_info',{}).get('authors',[])])}\n"  # noqa: E501
            f"Summary: {result.get('publication_info',{}).get('summary','')}\n"
            f"Total-Citations: {result.get('inline_links',{}).get('cited_by',{}).get('total','')}"  # noqa: E501
            for result in total_results
        ]
        return "\n\n".join(docs)