# File size: 4,965 Bytes
# c576592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# from transformers import AutoModelForSequenceClassification, AutoTokenizer
import itertools
import re
import heapq
import torch
from gtts import gTTS


def filter_articles(articles_list, company_name):
    """
    Keep only the articles whose title or summary mentions the company.

    The company name is matched as a literal, case-insensitive substring of
    "<title> <summary>". It is deliberately NOT interpreted as a regular
    expression, so names containing regex metacharacters (e.g. "C++",
    "Yahoo!", "AT&T (US)") are matched verbatim instead of raising
    ``re.error`` or matching unrelated text.

    Args:
        articles_list (list): List of dictionaries with 'title' and 'summary'
            string values.
        company_name (str): The company name to filter articles by.

    Returns:
        list: The articles (the same dict objects, in their original order)
            whose combined title and summary contain the company name.
    """
    # Lower-case the search term once instead of once per article.
    needle = company_name.lower()

    return [
        article
        for article in articles_list
        if needle in (article["title"] + " " + article["summary"]).lower()
    ]


def bs4_extractor(company_name: str, timeout: float = 10.0):
    """
    Extracts news articles from The New York Times and BBC for a given company.

    The search result pages of both sites are downloaded and parsed with
    BeautifulSoup. At most the first 10 scraped articles are considered
    (NYTimes results are collected before BBC results), and that capped list
    is then passed through ``filter_articles``, so fewer than 10 articles may
    be returned.

    Args:
        company_name (str): The name of the company to search for. It is sent
            as a properly URL-encoded query parameter, so names containing
            spaces or characters such as '&', '#' or '+' are searched verbatim.
        timeout (float, optional): Seconds to wait for each HTTP request
            before ``requests`` gives up. Defaults to 10.0.

    Returns:
        list: A list of dictionaries containing article titles and summaries
            ('title' and 'summary' keys).

    Raises:
        requests.exceptions.RequestException: If either search page cannot be
            fetched (connection failure, timeout, ...).
    """
    articles_list = []

    # Fetch and parse NYTimes articles. Passing the name via ``params`` lets
    # requests URL-encode it instead of splicing raw text into the query.
    nytimes_page = requests.get(
        "https://www.nytimes.com/search",
        params={"query": company_name},
        timeout=timeout,
    ).text
    nytimes_soup = BeautifulSoup(nytimes_page, "html.parser")

    for article in nytimes_soup.find_all("li", {"data-testid": "search-bodega-result"}):
        try:
            # ``find`` returns None when the element is missing; the ``.text``
            # access then raises AttributeError, which skips this result.
            title = article.find("h4").text.strip()
            # NOTE(review): "css-e5tzus" looks like a build-generated class
            # name and may change when the site is redeployed - confirm.
            summary = article.find("p", {"class": "css-e5tzus"}).text.strip()

            if not title or not summary:
                continue

            articles_list.append({"title": title, "summary": summary})
        except AttributeError as e:
            print(f"NYTimes Extraction Error: {e}")
            continue

    # Fetch and parse BBC articles
    bbc_page = requests.get(
        "https://www.bbc.com/search",
        params={"q": company_name},
        timeout=timeout,
    ).text
    bbc_soup = BeautifulSoup(bbc_page, "html.parser")

    for article in bbc_soup.find_all("div", {"data-testid": "newport-article"}):
        try:
            title = article.find("h2", {"data-testid": "card-headline"}).text.strip()
            # NOTE(review): "sc-4ea10043-3 kMizuB" looks like a generated
            # class name as well - confirm it is still current.
            summary = article.find(
                "div", {"class": "sc-4ea10043-3 kMizuB"}
            ).text.strip()

            if not title or not summary:
                continue

            articles_list.append({"title": title, "summary": summary})
        except AttributeError as e:
            print(f"BBC Extraction Error: {e}")
            continue

    # Cap first, then filter: relevance filtering only sees the first 10 hits.
    articles_list = articles_list[:10]
    articles_filtered = filter_articles(articles_list, company_name)
    return articles_filtered


def save_audio(hindi_text, output_path="output.mp3"):
    """
    Synthesizes speech for the given text with gTTS and saves it to a file.

    Args:
        hindi_text (str): The text to speak; it is synthesized with the
            Hindi ("hi") language setting at normal speed.
        output_path (str, optional): Where to write the audio file. Defaults
            to "output.mp3" (relative to the current working directory),
            which is the location this function has always written to.

    Returns:
        None
    """
    tts = gTTS(text=hindi_text, lang="hi", slow=False)
    tts.save(output_path)


class SentimentAnalyzer:
    """Tags articles with a sentiment label from a text-classification pipeline."""

    def __init__(
        self, model_id="mrm8488/deberta-v3-ft-financial-news-sentiment-analysis"
    ):
        """
        Builds the text-classification pipeline used for labelling.

        Args:
            model_id (str, optional): Identifier of the model handed to
                ``pipeline``. Defaults to a financial-news sentiment model.
        """
        # Use the first CUDA device when one is available, otherwise the CPU.
        if torch.cuda.is_available():
            target_device = "cuda:0"
        else:
            target_device = "cpu"

        self.pipe = pipeline(
            task="text-classification", model=model_id, device=target_device
        )

    def classify_sentiments(self, articles_list):
        """
        Labels every article with the sentiment predicted for its text.

        Each article is classified on the string "<title>. <summary>" and the
        label of the pipeline's first result is stored under 'sentiment'.

        Args:
            articles_list (list of dict): Articles with 'title' and 'summary'
                keys.

        Returns:
            list of dict: The same list object that was passed in; its
                dictionaries are updated in place with a 'sentiment' key.
        """
        for entry in articles_list:
            text = "{}. {}".format(entry["title"], entry["summary"])
            predictions = self.pipe(text)
            top_prediction = predictions[0]
            entry["sentiment"] = top_prediction["label"]

        return articles_list


class SemanticGrouping:
    """Finds the most similar article pairs using sentence embeddings."""

    def __init__(self, model_id="sentence-transformers/all-MiniLM-L6-v2"):
        """
        Loads the sentence-embedding model.

        Args:
            model_id (str, optional): Identifier passed to
                ``SentenceTransformer``. Defaults to
                "sentence-transformers/all-MiniLM-L6-v2".
        """
        self.model = SentenceTransformer(model_id)

    def find_top_k_similar_articles(self, articles, k=5):
        """
        Finds the top-k most similar pairs of articles using cosine similarity.

        Args:
            articles (list of str): A list of article texts to compare.
            k (int, optional): The number of top similar pairs to return.
                Defaults to 5.

        Returns:
            list of tuples: Up to ``k`` (index1, index2, similarity_score)
                tuples with index1 < index2, ordered from most to least
                similar. Empty when fewer than two articles are given.
        """
        # Fewer than two texts cannot form a pair, so skip the encoder
        # entirely instead of running the model on unusable input.
        if len(articles) < 2:
            return []

        embeddings = self.model.encode(articles, convert_to_tensor=True)

        # Convert the similarity matrix to nested Python lists once, rather
        # than extracting every pair's score with a separate ``.item()`` call.
        cosine_scores = util.pytorch_cos_sim(embeddings, embeddings).tolist()

        # Each unordered pair (i < j) is scored once; self-pairs are excluded.
        scored_pairs = (
            (i, j, cosine_scores[i][j])
            for i, j in itertools.combinations(range(len(articles)), 2)
        )

        return heapq.nlargest(k, scored_pairs, key=lambda pair: pair[2])