File size: 4,965 Bytes
c576592 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import pandas as pd
from sentence_transformers import SentenceTransformer, util
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
import itertools
import re
import heapq
import torch
from gtts import gTTS
def filter_articles(articles_list, company_name):
    """
    Keeps only the articles whose title or summary mentions the company name.

    The match is a case-insensitive, literal substring test. The company name
    is never interpreted as a regular expression, so names containing
    characters such as '+', '.', '(' or '&' are matched verbatim.

    Args:
        articles_list (list): List of dictionaries with 'title' and 'summary'.
        company_name (str): The company name to filter articles by.
    Returns:
        list: A new list holding the articles that contain the company name,
            in their original order.
    """
    # Lower-case the needle once instead of on every iteration.
    needle = company_name.lower()
    return [
        article
        for article in articles_list
        if needle in (article["title"] + " " + article["summary"]).lower()
    ]
def _collect_articles(soup, container, title_query, summary_query, source_label):
    """
    Pulls title/summary pairs out of every result container of a parsed page.

    Args:
        soup (BeautifulSoup): Parsed search-results page.
        container (tuple): Positional arguments for ``find_all`` locating each result.
        title_query (tuple): Positional arguments for ``find`` locating the title.
        summary_query (tuple): Positional arguments for ``find`` locating the summary.
        source_label (str): Prefix used in the extraction error message.
    Returns:
        list: Dictionaries with 'title' and 'summary' keys, in page order.
    """
    collected = []
    for article in soup.find_all(*container):
        try:
            title = article.find(*title_query).text.strip()
            summary = article.find(*summary_query).text.strip()
        except AttributeError as e:
            # find() returned None: the expected element is missing in this result.
            print(f"{source_label} Extraction Error: {e}")
            continue
        # Skip results whose title or summary is empty after stripping.
        if title and summary:
            collected.append({"title": title, "summary": summary})
    return collected


def bs4_extractor(company_name: str, timeout: float = 10.0):
    """
    Extracts news articles from The New York Times and BBC for a given company.

    Args:
        company_name (str): The name of the company to search for.
        timeout (float, optional): Seconds to wait for each site before giving
            up. Defaults to 10.0.
    Returns:
        list: A list of dictionaries containing article titles and summaries.
    Raises:
        requests.exceptions.RequestException: If a search page cannot be
            fetched (including when the timeout expires).
    """
    articles_list = []

    # Fetch and parse NYTimes articles.
    # Passing the name through `params` URL-encodes it, so names containing
    # '&', '#', spaces, etc. cannot corrupt the query string.
    nytimes_page = requests.get(
        "https://www.nytimes.com/search",
        params={"query": company_name},
        timeout=timeout,
    ).text
    nytimes_soup = BeautifulSoup(nytimes_page, "html.parser")
    # NOTE(review): "css-e5tzus" looks like a build-generated class name and may
    # change when the site is redeployed - verify periodically.
    articles_list.extend(
        _collect_articles(
            nytimes_soup,
            ("li", {"data-testid": "search-bodega-result"}),
            ("h4",),
            ("p", {"class": "css-e5tzus"}),
            "NYTimes",
        )
    )

    # Fetch and parse BBC articles.
    bbc_page = requests.get(
        "https://www.bbc.com/search",
        params={"q": company_name},
        timeout=timeout,
    ).text
    bbc_soup = BeautifulSoup(bbc_page, "html.parser")
    # NOTE(review): "sc-4ea10043-3 kMizuB" also looks build-generated - verify.
    articles_list.extend(
        _collect_articles(
            bbc_soup,
            ("div", {"data-testid": "newport-article"}),
            ("h2", {"data-testid": "card-headline"}),
            ("div", {"class": "sc-4ea10043-3 kMizuB"}),
            "BBC",
        )
    )

    # NOTE(review): the cap of 10 is applied before the company-name filter, so
    # fewer than 10 articles (possibly only NYTimes ones) may be returned.
    # Kept as-is to preserve existing behavior - confirm this is intended.
    articles_list = articles_list[:10]
    articles_filtered = filter_articles(articles_list, company_name)
    return articles_filtered
def save_audio(hindi_text, output_path="output.mp3"):
    """
    Converts text to speech with gTTS and saves the audio to a file.

    Args:
        hindi_text (str): The text to synthesize; it is spoken using the
            Hindi ("hi") language setting at normal speed.
        output_path (str, optional): Where to write the audio file.
            Defaults to "output.mp3".
    """
    tts = gTTS(text=hindi_text, lang="hi", slow=False)
    tts.save(output_path)
class SentimentAnalyzer:
    """Labels news articles with a sentiment using a text-classification pipeline."""

    def __init__(
        self, model_id="mrm8488/deberta-v3-ft-financial-news-sentiment-analysis"
    ):
        """
        Builds the text-classification pipeline.
        Args:
            model_id (str): Identifier of the model handed to the pipeline.
        """
        # Use the first CUDA device when one is available, otherwise the CPU.
        if torch.cuda.is_available():
            device = "cuda:0"
        else:
            device = "cpu"
        self.pipe = pipeline(task="text-classification", model=model_id, device=device)

    def classify_sentiments(self, articles_list):
        """
        Tags every article with the label predicted for its title and summary.
        Args:
            articles_list (list of dict): Articles with 'title' and 'summary' keys.
        Returns:
            list of dict: The same list object that was passed in; each
                dictionary is updated in place with a 'sentiment' key.
        """
        for entry in articles_list:
            text = "{}. {}".format(entry["title"], entry["summary"])
            predictions = self.pipe(text)
            # Only the label of the first returned prediction is stored.
            top_prediction = predictions[0]
            entry["sentiment"] = top_prediction["label"]
        return articles_list
class SemanticGrouping:
    """Finds the most similar article pairs by comparing sentence embeddings."""

    def __init__(self, model_id="sentence-transformers/all-MiniLM-L6-v2"):
        """
        Loads the sentence-embedding model.
        Args:
            model_id (str): Identifier passed to SentenceTransformer.
        """
        self.model = SentenceTransformer(model_id)

    def find_top_k_similar_articles(self, articles, k=5):
        """
        Finds the top-k most similar pairs of articles using cosine similarity.
        Args:
            articles (list of str): A list of article texts to compare.
            k (int, optional): The number of top similar pairs to return. Defaults to 5.
        Returns:
            list of tuples: A list of (index1, index2, similarity_score) tuples
                with index1 < index2, highest score first. Empty when fewer
                than two articles are given.
        """
        # Fewer than two articles cannot form a pair, so skip the model call
        # (and avoid handing an empty batch to the encoder).
        if len(articles) < 2:
            return []

        embeddings = self.model.encode(articles, convert_to_tensor=True)
        # Convert the whole score matrix to Python floats in one step rather
        # than calling .item() once per pair.
        cosine_scores = util.pytorch_cos_sim(embeddings, embeddings).tolist()

        # Each unordered pair (i < j) is scored exactly once.
        similarity_scores = [
            (i, j, cosine_scores[i][j])
            for i, j in itertools.combinations(range(len(articles)), 2)
        ]
        return heapq.nlargest(k, similarity_scores, key=lambda x: x[2])
|