Sam Chaudry committed on
Commit
35311c2
·
1 Parent(s): 0ed4968

Optimisations

Browse files
Files changed (1) hide show
  1. media_trust.py +27 -10
media_trust.py CHANGED
@@ -5,10 +5,15 @@ import datetime
5
  import nltk
6
  from datetime import datetime, timedelta
7
  from nltk.sentiment.vader import SentimentIntensityAnalyzer
8
- nltk.download('vader_lexicon')
 
 
 
 
9
 
10
  from transformers import pipeline
11
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
 
12
 
13
  from dotenv import load_dotenv
14
  import os
@@ -60,14 +65,14 @@ def query(topic, sort_by="popularity", max_tokens=100):
60
  return None
61
 
62
  today = datetime.today()
63
- last_week = today - timedelta(days=3)
64
  from_date = last_week.strftime('%Y-%m-%d')
65
  to_date = today.strftime('%Y-%m-%d')
66
 
67
  base_url = "https://newsapi.org/v2/everything"
68
  url = (
69
  f"{base_url}?q={topic}&from={from_date}&to={to_date}"
70
- f"&sortBy={sort_by}&pageSize=5&apiKey={api_key}"
71
  )
72
 
73
  try:
@@ -103,6 +108,10 @@ def query(topic, sort_by="popularity", max_tokens=100):
103
 
104
 
105
  def process_data(df):
 
 
 
 
106
  df_cleaned = df.dropna(subset=["title", "description"])
107
  df_cleaned = df_cleaned[df_cleaned["title"].str.strip() != ""]
108
  df_cleaned = df_cleaned[df_cleaned["description"].str.strip() != ""]
@@ -110,15 +119,20 @@ def process_data(df):
110
  df_cleaned["text"] = df_cleaned["title"] + df_cleaned["description"].str.lower()
111
  return df_cleaned
112
 
 
113
  def analyse_sentiment(df):
114
  analyser = SentimentIntensityAnalyzer()
115
 
116
- df['compound'] = [analyser.polarity_scores(x)['compound'] for x in df['text']]
117
- df['neg'] = [analyser.polarity_scores(x)['neg'] for x in df['text']]
118
- df['neu'] = [analyser.polarity_scores(x)['neu'] for x in df['text']]
119
- df['pos'] = [analyser.polarity_scores(x)['pos'] for x in df['text']]
 
 
 
 
120
 
121
- def label(score):
122
  if score >= 0.05:
123
  return "positive"
124
  elif score <= -0.05:
@@ -126,7 +140,7 @@ def analyse_sentiment(df):
126
  else:
127
  return "neutral"
128
 
129
- df['sentiment_label'] = df['compound'].apply(label)
130
  return df
131
 
132
  def get_bias_label(source_name):
@@ -175,7 +189,10 @@ def summarise_text(row, max_tokens=512):
175
  return pd.Series({'summary': 'Summary unavailable', 'bias_score': 'unknown', 'source': 'unknown'})
176
 
177
  def add_article_summaries(df, max_tokens=512):
178
- summary_df = df.apply(summarise_text, axis=1, max_tokens=max_tokens)
 
 
 
179
  df[['summary', 'bias_score', 'source']] = summary_df
180
  return df
181
 
 
5
  import nltk
6
  from datetime import datetime, timedelta
7
  from nltk.sentiment.vader import SentimentIntensityAnalyzer
8
+
9
+ try:
10
+ nltk.data.find('sentiment/vader_lexicon')
11
+ except LookupError:
12
+ nltk.download('vader_lexicon')
13
 
14
  from transformers import pipeline
15
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
16
+ from concurrent.futures import ThreadPoolExecutor
17
 
18
  from dotenv import load_dotenv
19
  import os
 
65
  return None
66
 
67
  today = datetime.today()
68
+ last_week = today - timedelta(days=7)
69
  from_date = last_week.strftime('%Y-%m-%d')
70
  to_date = today.strftime('%Y-%m-%d')
71
 
72
  base_url = "https://newsapi.org/v2/everything"
73
  url = (
74
  f"{base_url}?q={topic}&from={from_date}&to={to_date}"
75
+ f"&sortBy={sort_by}&pageSize=10&apiKey={api_key}"
76
  )
77
 
78
  try:
 
108
 
109
 
110
  def process_data(df):
111
+ if df is None or df.empty or not all(col in df.columns for col in ["title", "description"]):
112
+ print("Invalid or empty DataFrame passed to process_data()")
113
+ return pd.DataFrame()
114
+
115
  df_cleaned = df.dropna(subset=["title", "description"])
116
  df_cleaned = df_cleaned[df_cleaned["title"].str.strip() != ""]
117
  df_cleaned = df_cleaned[df_cleaned["description"].str.strip() != ""]
 
119
  df_cleaned["text"] = df_cleaned["title"] + df_cleaned["description"].str.lower()
120
  return df_cleaned
121
 
122
+
123
def analyse_sentiment(df):
    """Attach VADER sentiment scores to *df* based on its 'text' column.

    Adds the numeric columns 'compound', 'neg', 'neu' and 'pos', plus a
    categorical 'sentiment_label' column ('positive' / 'negative' /
    'neutral') derived from the compound score.

    Mutates *df* in place and returns it.
    """
    analyser = SentimentIntensityAnalyzer()

    # One polarity_scores() call per row yields all four metrics at once.
    # VADER is pure Python, so a thread pool gains nothing under the GIL;
    # a plain single-threaded pass is simpler and at least as fast.
    all_scores = [analyser.polarity_scores(text) for text in df['text']]
    df[['compound', 'neg', 'neu', 'pos']] = [
        (s['compound'], s['neg'], s['neu'], s['pos']) for s in all_scores
    ]

    def label_sentiment(score):
        # Standard VADER thresholds: >= 0.05 positive, <= -0.05 negative.
        # NOTE(review): the "negative" branch body was hidden by a diff hunk
        # boundary in the source view; reconstructed from the visible
        # positive/neutral branches — confirm against the full file.
        if score >= 0.05:
            return "positive"
        elif score <= -0.05:
            return "negative"
        else:
            return "neutral"

    df['sentiment_label'] = df['compound'].apply(label_sentiment)
    return df
145
 
146
  def get_bias_label(source_name):
 
189
  return pd.Series({'summary': 'Summary unavailable', 'bias_score': 'unknown', 'source': 'unknown'})
190
 
191
def add_article_summaries(df, max_tokens=512):
    """Summarise every article in *df* and attach the results as columns.

    Runs summarise_text over each row (as a dict) in a small thread pool —
    the per-article work is I/O/model-bound, so threads overlap it — and
    adds the 'summary', 'bias_score' and 'source' columns to *df*.
    executor.map preserves input order, so results line up with rows.

    Mutates *df* in place and returns it.
    """
    with ThreadPoolExecutor(max_workers=4) as executor:
        summaries = list(
            executor.map(lambda row: summarise_text(row, max_tokens),
                         df.to_dict('records'))
        )

    # Reuse df's own index: a bare pd.DataFrame(summaries) would get a fresh
    # RangeIndex, and multi-column assignment aligns on index — with a
    # filtered (non-contiguous) df that silently fills the columns with NaN.
    summary_df = pd.DataFrame(summaries, index=df.index)
    df[['summary', 'bias_score', 'source']] = summary_df
    return df
198