Spaces:

Senzen
/

Back-End

Sleeping

File size: 4,965 Bytes

c576592

import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# from transformers import AutoModelForSequenceClassification, AutoTokenizer
import itertools
import re
import heapq
import torch
from gtts import gTTS


def filter_articles(articles_list, company_name):
    """
    Keep only the articles that mention the company name.

    An article matches when the company name occurs, case-insensitively, as a
    literal substring of ``title + " " + summary``. The name is compared as
    plain text rather than as a regular expression, so names containing
    characters such as ``+``, ``.``, ``(`` or ``*`` are matched verbatim.

    Args:
        articles_list (list): List of dictionaries with 'title' and 'summary'.
        company_name (str): The company name to filter articles by.

    Returns:
        list: A new list with the matching article dictionaries, in their
            original order. The dictionaries themselves are not copied.
    """
    # Lower-case the needle once instead of on every iteration.
    needle = company_name.lower()

    return [
        article
        for article in articles_list
        if needle in (article["title"] + " " + article["summary"]).lower()
    ]


# Upper bound (seconds) on each search request so a stalled server cannot
# block the caller forever; requests has no timeout by default.
_REQUEST_TIMEOUT_SECONDS = 10


def _scrape_search_results(
    url, params, container, title_tag, summary_tag, source_label
):
    """
    Fetch one search page and collect title/summary pairs from its result cards.

    Args:
        url (str): Search endpoint without a query string.
        params (dict): Query parameters; requests URL-encodes them.
        container (tuple): Positional arguments for ``find_all`` that select
            one element per search result.
        title_tag (tuple): Positional arguments for ``find`` that select the
            title element inside a result.
        summary_tag (tuple): Positional arguments for ``find`` that select the
            summary element inside a result.
        source_label (str): Prefix used in the printed extraction-error message.

    Returns:
        list: Dictionaries with non-empty 'title' and 'summary' strings.
    """
    page = requests.get(url, params=params, timeout=_REQUEST_TIMEOUT_SECONDS).text
    soup = BeautifulSoup(page, "html.parser")

    results = []
    for card in soup.find_all(*container):
        try:
            # find() returns None when the element is missing, which makes
            # the .text access raise AttributeError; such cards are skipped.
            title = card.find(*title_tag).text.strip()
            summary = card.find(*summary_tag).text.strip()
        except AttributeError as e:
            print(f"{source_label} Extraction Error: {e}")
            continue

        if title and summary:
            results.append({"title": title, "summary": summary})

    return results


def bs4_extractor(company_name: str):
    """
    Extracts news articles from The New York Times and BBC for a given company.

    The search pages of both sites are fetched over HTTP, the result cards are
    parsed with BeautifulSoup, the combined list (NYTimes results first) is cut
    to its first 10 entries, and the remainder is passed through
    ``filter_articles`` so only articles mentioning the company are returned.

    The company name is sent as a URL-encoded query parameter, so names with
    spaces or characters such as ``&`` are transmitted intact.

    Args:
        company_name (str): The name of the company to search for.

    Returns:
        list: A list of dictionaries containing article titles and summaries.

    Raises:
        requests.exceptions.RequestException: If either search request fails
            or exceeds the timeout.
    """
    # NOTE(review): the class-based selectors below look like build-generated
    # CSS class names and will presumably stop matching when the sites are
    # redeployed -- confirm and prefer data-testid attributes where available.
    articles_list = _scrape_search_results(
        "https://www.nytimes.com/search",
        {"query": company_name},
        ("li", {"data-testid": "search-bodega-result"}),
        ("h4",),
        ("p", {"class": "css-e5tzus"}),
        "NYTimes",
    )
    articles_list += _scrape_search_results(
        "https://www.bbc.com/search",
        {"q": company_name},
        ("div", {"data-testid": "newport-article"}),
        ("h2", {"data-testid": "card-headline"}),
        ("div", {"class": "sc-4ea10043-3 kMizuB"}),
        "BBC",
    )

    # NOTE(review): the 10-article cap is applied before filtering, so fewer
    # than 10 relevant articles may come back even when more were scraped --
    # confirm this ordering is intended.
    articles_list = articles_list[:10]
    return filter_articles(articles_list, company_name)


def save_audio(hindi_text, output_path="output.mp3", lang="hi"):
    """
    Convert text to speech with gTTS and save the result as an MP3 file.

    Args:
        hindi_text (str): The text to speak.
        output_path (str, optional): Where the audio file is written.
            Defaults to "output.mp3", the location used before this
            parameter existed.
        lang (str, optional): Language code handed to gTTS. Defaults to "hi".

    Returns:
        None
    """
    tts = gTTS(text=hindi_text, lang=lang, slow=False)
    tts.save(output_path)


class SentimentAnalyzer:
    """Labels articles with a sentiment using a text-classification pipeline."""

    def __init__(
        self, model_id="mrm8488/deberta-v3-ft-financial-news-sentiment-analysis"
    ):
        """
        Build the classification pipeline.

        Args:
            model_id (str, optional): Model identifier passed to ``pipeline``.
        """
        # Run on the first CUDA device when one is available, else on the CPU.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(task="text-classification", model=model_id, device=device)

    def classify_sentiments(self, articles_list):
        """
        Classifies the sentiment of each article based on its title and summary.

        The text given to the model is ``"<title>. <summary>"``. Each article
        dictionary is updated in place with a 'sentiment' key holding the
        label of the first prediction.

        Args:
            articles_list (list of dict): A list of articles with 'title' and
                'summary' keys.

        Returns:
            list of dict: The same list object that was passed in, with a
                'sentiment' key added to every article.
        """
        for article in articles_list:
            text = f"{article['title']}. {article['summary']}"
            # truncation=True is forwarded to the tokenizer so an unusually
            # long title/summary is cut to the model's maximum length instead
            # of failing inside the model.
            prediction = self.pipe(text, truncation=True)
            article["sentiment"] = prediction[0]["label"]

        return articles_list


class SemanticGrouping:
    """Finds similar articles by comparing sentence-embedding vectors."""

    def __init__(self, model_id="sentence-transformers/all-MiniLM-L6-v2"):
        """
        Load the embedding model.

        Args:
            model_id (str, optional): Model identifier passed to
                ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_id)

    def find_top_k_similar_articles(self, articles, k=5):
        """
        Finds the top-k most similar pairs of articles using cosine similarity.

        Args:
            articles (list of str): A list of article texts to compare.
            k (int, optional): The number of top similar pairs to return.
                Defaults to 5.

        Returns:
            list of tuples: A list of (index1, index2, similarity_score)
                tuples with index1 < index2, ordered from most to least
                similar. Holds fewer than k entries when fewer pairs exist,
                and is empty when fewer than two articles are given.
        """
        # No pair can be formed from zero or one article, so skip the encoder.
        if len(articles) < 2:
            return []

        embeddings = self.model.encode(articles, convert_to_tensor=True)
        # Convert the whole score matrix to nested Python lists once instead
        # of calling .item() separately for every pair.
        score_matrix = util.pytorch_cos_sim(embeddings, embeddings).tolist()

        index_pairs = itertools.combinations(range(len(articles)), 2)
        scored_pairs = ((i, j, score_matrix[i][j]) for i, j in index_pairs)

        return heapq.nlargest(k, scored_pairs, key=lambda pair: pair[2])