File size: 4,861 Bytes
ed4d993
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import re
from typing import Any, Dict, List, Literal, Optional

from langchain_core.callbacks import (
    AsyncCallbackManagerForRetrieverRun,
    CallbackManagerForRetrieverRun,
)
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever


class AskNewsRetriever(BaseRetriever):
    """Retriever that queries the AskNews news-search endpoint.

    Each query is sent to ``news.search_news`` of the AskNews SDK (sync or
    async client) and the response is converted into LangChain ``Document``
    objects: the page content is the text found between ``<doc>``/``</doc>``
    delimiters in the response's string form, and the metadata carries the
    article title, URL and image URL.

    Credentials are taken from ``client_id`` / ``client_secret`` when set,
    otherwise from the ``ASKNEWS_CLIENT_ID`` / ``ASKNEWS_CLIENT_SECRET``
    environment variables (a ``KeyError`` is raised if neither is available).
    """

    # Sent to the API as ``n_articles``.
    k: int = 10
    # Sent to the API as ``offset``.
    offset: int = 0
    # Optional bounds forwarded unchanged to the API.
    # NOTE(review): presumably unix timestamps -- confirm against the SDK docs.
    start_timestamp: Optional[int] = None
    end_timestamp: Optional[int] = None
    # Search method forwarded to the API.
    # NOTE(review): presumably "nl" = natural language, "kw" = keyword -- confirm.
    method: Literal["nl", "kw"] = "nl"
    # Category filter forwarded to the API.
    categories: List[
        Literal[
            "All",
            "Business",
            "Crime",
            "Politics",
            "Science",
            "Sports",
            "Technology",
            "Military",
            "Health",
            "Entertainment",
            "Finance",
            "Culture",
            "Climate",
            "Environment",
            "World",
        ]
    ] = ["All"]
    # Forwarded to the API as ``historical``.
    historical: bool = False
    # Forwarded to the API as ``similarity_score_threshold``.
    similarity_score_threshold: float = 0.5
    # Extra keyword arguments passed verbatim to ``search_news``; ``None`` is
    # treated the same as an empty dict.
    kwargs: Optional[Dict[str, Any]] = {}
    # Explicit credentials; fall back to environment variables when unset.
    client_id: Optional[str] = None
    client_secret: Optional[str] = None

    def _client_params(self) -> Dict[str, Any]:
        """Build the constructor arguments shared by the sync and async SDK clients.

        Raises:
            KeyError: If a credential is neither set on the retriever nor
                present in the environment.
        """
        return {
            "client_id": self.client_id or os.environ["ASKNEWS_CLIENT_ID"],
            "client_secret": self.client_secret
            or os.environ["ASKNEWS_CLIENT_SECRET"],
            "scopes": ["news"],
        }

    def _search_params(self, query: str) -> Dict[str, Any]:
        """Build the fixed ``search_news`` arguments for ``query``.

        User-supplied ``kwargs`` are deliberately NOT merged in here: they are
        unpacked separately at the call site so that a key colliding with one
        of these arguments still raises ``TypeError`` instead of silently
        overriding it.
        """
        return {
            "query": query,
            "n_articles": self.k,
            "start_timestamp": self.start_timestamp,
            "end_timestamp": self.end_timestamp,
            "method": self.method,
            "categories": self.categories,
            "historical": self.historical,
            "similarity_score_threshold": self.similarity_score_threshold,
            "offset": self.offset,
            # These delimiters must match the regex in ``_extract_documents``.
            "doc_start_delimiter": "<doc>",
            "doc_end_delimiter": "</doc>",
            # "both" so the response carries the string form (page content)
            # and the structured form (metadata).
            "return_type": "both",
        }

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Get documents relevant to a query.

        Args:
            query: String to find relevant documents for
            run_manager: The callbacks handler to use

        Returns:
            List of relevant documents

        Raises:
            ImportError: If the ``asknews`` package is not installed.
            KeyError: If credentials are not configured.
        """
        try:
            from asknews_sdk import AskNewsSDK
        except ImportError as exc:
            raise ImportError(
                "AskNews python package not found. "
                "Please install it with `pip install asknews`."
            ) from exc

        an_client = AskNewsSDK(**self._client_params())
        # ``kwargs`` is declared Optional, so guard against an explicit None.
        response = an_client.news.search_news(
            **self._search_params(query),
            **(self.kwargs or {}),
        )
        return self._extract_documents(response)

    async def _aget_relevant_documents(
        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Asynchronously get documents relevant to a query.

        Args:
            query: String to find relevant documents for
            run_manager: The callbacks handler to use

        Returns:
            List of relevant documents

        Raises:
            ImportError: If the ``asknews`` package is not installed.
            KeyError: If credentials are not configured.
        """
        try:
            from asknews_sdk import AsyncAskNewsSDK
        except ImportError as exc:
            raise ImportError(
                "AskNews python package not found. "
                "Please install it with `pip install asknews`."
            ) from exc

        an_client = AsyncAskNewsSDK(**self._client_params())
        # ``kwargs`` is declared Optional, so guard against an explicit None.
        response = await an_client.news.search_news(
            **self._search_params(query),
            **(self.kwargs or {}),
        )
        return self._extract_documents(response)

    def _extract_documents(self, response: Any) -> List[Document]:
        """Extract documents from an api response.

        Args:
            response: The object returned by ``search_news``; its
                ``as_string`` and ``as_dicts`` attributes are read.

        Returns:
            One ``Document`` per ``<doc>...</doc>`` span that has a matching
            entry in ``as_dicts``.
        """
        # Tolerate a missing string/structured part rather than crashing.
        raw_text = response.as_string or ""
        articles = response.as_dicts or []

        # DOTALL: article bodies span multiple lines.
        passages = re.findall(r"<doc>(.*?)</doc>", raw_text, re.DOTALL)

        # Assumes the i-th delimited span describes the i-th structured
        # article (same positional pairing as before). ``zip`` stops at the
        # shorter sequence, so a length mismatch no longer raises IndexError.
        return [
            Document(
                page_content=passage.strip(),
                metadata={
                    "title": article.title,
                    "source": str(article.article_url)
                    if article.article_url
                    else None,
                    "images": article.image_url,
                },
            )
            for passage, article in zip(passages, articles)
        ]