Spaces: Running
Sam Chaudry committed
Commit · 35311c2
Parent(s): 0ed4968
Optimisations
Browse files
- media_trust.py +27 -10
media_trust.py
CHANGED
@@ -5,10 +5,15 @@ import datetime
 import nltk
 from datetime import datetime, timedelta
 from nltk.sentiment.vader import SentimentIntensityAnalyzer
-nltk.download('vader_lexicon')
+
+try:
+    nltk.data.find('sentiment/vader_lexicon')
+except LookupError:
+    nltk.download('vader_lexicon')
 
 from transformers import pipeline
 summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+from concurrent.futures import ThreadPoolExecutor
 
 from dotenv import load_dotenv
 import os
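The guard around nltk.data.find() makes the lexicon download a one-time cost. One import-time cost the commit does not touch: pipeline(...) loads the distilbart model the moment the module is imported. A possible follow-up, sketched here only as an idea (the get_summarizer helper is hypothetical, not part of this commit):

from transformers import pipeline

_summarizer = None

def get_summarizer():
    # Build the summarization pipeline on first use, then reuse the cached
    # instance; code paths that never summarise skip the model load entirely.
    global _summarizer
    if _summarizer is None:
        _summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    return _summarizer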
@@ -60,14 +65,14 @@ def query(topic, sort_by="popularity", max_tokens=100):
         return None
 
     today = datetime.today()
-    last_week = today - timedelta(days=
+    last_week = today - timedelta(days=7)
     from_date = last_week.strftime('%Y-%m-%d')
     to_date = today.strftime('%Y-%m-%d')
 
     base_url = "https://newsapi.org/v2/everything"
     url = (
         f"{base_url}?q={topic}&from={from_date}&to={to_date}"
-        f"&sortBy={sort_by}&pageSize=
+        f"&sortBy={sort_by}&pageSize=10&apiKey={api_key}"
     )
 
     try:
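The rebuilt URL now pins pageSize=10 and appends the API key, but the interpolated values are not URL-encoded, so a topic containing a space or '&' can break the query. A sketch of an alternative using the params argument of requests (assuming requests is the HTTP client used in the try: block; build_news_request is a hypothetical helper, not part of the commit):

import requests

def build_news_request(topic, from_date, to_date, sort_by, api_key):
    # requests URL-encodes every value, so spaces and '&' in the topic stay intact.
    params = {
        "q": topic,
        "from": from_date,
        "to": to_date,
        "sortBy": sort_by,
        "pageSize": 10,
        "apiKey": api_key,
    }
    return requests.get("https://newsapi.org/v2/everything", params=params, timeout=10)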
@@ -103,6 +108,10 @@ def query(topic, sort_by="popularity", max_tokens=100):
 
 
 def process_data(df):
+    if df is None or df.empty or not all(col in df.columns for col in ["title", "description"]):
+        print("Invalid or empty DataFrame passed to process_data()")
+        return pd.DataFrame()
+
     df_cleaned = df.dropna(subset=["title", "description"])
     df_cleaned = df_cleaned[df_cleaned["title"].str.strip() != ""]
     df_cleaned = df_cleaned[df_cleaned["description"].str.strip() != ""]
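The new guard turns malformed input into an empty frame instead of a KeyError from dropna(). A quick check of that behaviour, assuming media_trust is importable and pandas is available:

import pandas as pd
from media_trust import process_data

# Missing the required columns: the guard prints a warning and returns an empty frame.
bad = pd.DataFrame({"headline": ["x"]})
assert process_data(bad).empty

# Well-formed input passes through the cleaning steps and gains a 'text' column.
good = pd.DataFrame({"title": ["A story"], "description": ["Something happened."]})
assert len(process_data(good)) == 1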
@@ -110,15 +119,20 @@ def process_data(df):
     df_cleaned["text"] = df_cleaned["title"] + df_cleaned["description"].str.lower()
     return df_cleaned
 
+
 def analyse_sentiment(df):
     analyser = SentimentIntensityAnalyzer()
 
-
-
-
-
+    def get_scores(text):
+        scores = analyser.polarity_scores(text)
+        return scores['compound'], scores['neg'], scores['neu'], scores['pos']
+
+    with ThreadPoolExecutor(max_workers=4) as executor:
+        results = list(executor.map(get_scores, df['text']))
+
+    df[['compound', 'neg', 'neu', 'pos']] = results
 
-    def
+    def label_sentiment(score):
         if score >= 0.05:
             return "positive"
         elif score <= -0.05:
@@ -126,7 +140,7 @@ def analyse_sentiment(df):
         else:
             return "neutral"
 
-    df['sentiment_label'] = df['compound'].apply(
+    df['sentiment_label'] = df['compound'].apply(label_sentiment)
     return df
 
 def get_bias_label(source_name):
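executor.map() preserves input order, so the list of (compound, neg, neu, pos) tuples lines up row-for-row with df['text'] and can be assigned to four columns in one step. One caveat: VADER scoring is pure Python and CPU-bound, so the four threads contend for the GIL and the speed-up may be modest. A self-contained sketch of the pattern, with the same 0.05 thresholds as the diff:

from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

df = pd.DataFrame({"text": ["I love this.", "I hate this.", "It exists."]})
analyser = SentimentIntensityAnalyzer()

def get_scores(text):
    scores = analyser.polarity_scores(text)
    return scores['compound'], scores['neg'], scores['neu'], scores['pos']

# map() yields results in input order, so row i of results matches row i of df.
with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(get_scores, df['text']))

df[['compound', 'neg', 'neu', 'pos']] = results
df['sentiment_label'] = df['compound'].apply(
    lambda c: "positive" if c >= 0.05 else "negative" if c <= -0.05 else "neutral"
)
print(df[['text', 'compound', 'sentiment_label']])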
@@ -175,7 +189,10 @@ def summarise_text(row, max_tokens=512):
         return pd.Series({'summary': 'Summary unavailable', 'bias_score': 'unknown', 'source': 'unknown'})
 
 def add_article_summaries(df, max_tokens=512):
-
+    with ThreadPoolExecutor(max_workers=4) as executor:
+        summaries = list(executor.map(lambda row: summarise_text(row, max_tokens), df.to_dict('records')))
+
+    summary_df = pd.DataFrame(summaries)
     df[['summary', 'bias_score', 'source']] = summary_df
     return df
 
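The summary path uses the same order-preserving map: df.to_dict('records') yields one dict per row, each call returns a pd.Series, and pd.DataFrame(summaries) stacks them into a frame whose rows align with df by position. Whether threads help here depends on how much of the transformers call releases the GIL; batching inputs through the pipeline would be another option, which this commit does not take. A toy sketch with a stand-in summariser (the real summarise_text calls the pipeline):

from concurrent.futures import ThreadPoolExecutor

import pandas as pd

def summarise_text(row, max_tokens=512):
    # Stand-in for the real function: truncate the text instead of running a model.
    return pd.Series({'summary': row['text'][:max_tokens],
                      'bias_score': 'unknown',
                      'source': row.get('source', 'unknown')})

df = pd.DataFrame({"text": ["first article body", "second article body"]})

with ThreadPoolExecutor(max_workers=4) as executor:
    summaries = list(executor.map(lambda row: summarise_text(row, 64), df.to_dict('records')))

summary_df = pd.DataFrame(summaries)           # rows align with df by position
df[['summary', 'bias_score', 'source']] = summary_df
print(df)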