Sam Chaudry committed on
Commit
2228634
·
1 Parent(s): addef69

Optimisations

Browse files
Files changed (1) hide show
  1. media_trust.py +58 -57
media_trust.py CHANGED
@@ -12,6 +12,7 @@ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
12
 
13
  from dotenv import load_dotenv
14
  import os
 
15
 
16
  load_dotenv()
17
 
@@ -53,45 +54,52 @@ BIAS_SCORE_MAP = {
53
  "unknown": 0
54
  }
55
 
56
- def query(query, sort_by="popularity", max_tokens=100):
 
 
 
57
 
58
- if query == "":
59
- print("Topic needs to be passed in")
60
- return
61
-
62
  today = datetime.today()
63
- seven_days_ago = today - timedelta(days=7)
64
- from_date = seven_days_ago.strftime('%Y-%m-%d')
65
  to_date = today.strftime('%Y-%m-%d')
66
-
67
  base_url = "https://newsapi.org/v2/everything"
68
- url = f"{base_url}?q={query}&from={from_date}&to={to_date}&sortBy={sort_by}&apiKey={api_key}"
69
- news = None
 
 
70
 
71
  try:
72
- news_response = requests.get(url, timeout=10)
73
- if news_response.status_code == 200:
74
- news = news_response.json()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- else:
77
- print("API error has occured", news_response.status_code)
78
- except Exception:
79
- print('An exception occurred')
80
-
81
- article_arr = news["articles"]
82
- extracted_data = []
83
-
84
- for article in article_arr:
85
- extracted_data.append({
86
- "title": article.get("title", "N/A"),
87
- "description": article.get("description", "N/A"),
88
- "source_name": article.get("source", {}).get("name", "N/A"),
89
- "url": article.get("url", "N/A"),
90
- "publishedAt": article.get("publishedAt", "N/A")
91
- })
92
-
93
- df = pd.DataFrame(extracted_data)
94
- return df
95
 
96
 
97
  def process_data(df):
@@ -103,15 +111,14 @@ def process_data(df):
103
  return df_cleaned
104
 
105
  def analyse_sentiment(df):
106
-
107
  analyser = SentimentIntensityAnalyzer()
108
-
109
  df['compound'] = [analyser.polarity_scores(x)['compound'] for x in df['text']]
110
  df['neg'] = [analyser.polarity_scores(x)['neg'] for x in df['text']]
111
  df['neu'] = [analyser.polarity_scores(x)['neu'] for x in df['text']]
112
  df['pos'] = [analyser.polarity_scores(x)['pos'] for x in df['text']]
113
-
114
- def label_sentiment(score):
115
  if score >= 0.05:
116
  return "positive"
117
  elif score <= -0.05:
@@ -119,15 +126,16 @@ def analyse_sentiment(df):
119
  else:
120
  return "neutral"
121
 
122
- df['sentiment_label'] = df['compound'].apply(label_sentiment)
123
  return df
124
 
125
  def get_bias_label(source_name):
126
- source = source_name.strip().lower()
127
- return SOURCE_BIAS_MAP.get(source, "unknown")
128
 
129
  def add_bias_annotation(df):
130
- df['bias_label'] = df['source_name'].apply(get_bias_label)
 
131
  return df
132
 
133
  def set_article_extremity(df, top_n=5):
@@ -153,31 +161,18 @@ def summarise_text(row, max_tokens=512):
153
  source_name = row['source_name'] if 'source_name' in row and pd.notna(row['source_name']) else 'unknown'
154
 
155
  input_length = len(text.split())
156
-
157
- if input_length < 40:
158
- max_length = max(10, int(input_length / 2))
159
- else:
160
- max_length = min(input_length - 10, max_tokens)
161
  min_length = max(10, max_length - 10)
162
 
163
  summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
164
  summary_text = summary[0]['summary_text']
165
-
166
  bias_label = get_bias_label(source_name)
167
 
168
- return pd.Series({
169
- 'summary': summary_text,
170
- 'bias_score': bias_label,
171
- 'source': source_name
172
- })
173
 
174
  except Exception as e:
175
  print(f"Error summarising row: {e}")
176
- return pd.Series({
177
- 'summary': 'Summary unavailable',
178
- 'bias_score': 'unknown',
179
- 'source': 'unknown'
180
- })
181
 
182
  def add_article_summaries(df, max_tokens=512):
183
  summary_df = df.apply(summarise_text, axis=1, max_tokens=max_tokens)
@@ -186,11 +181,17 @@ def add_article_summaries(df, max_tokens=512):
186
 
187
  def main():
188
  raw_df = query("Tesla")
 
 
 
 
189
  processed_df = process_data(raw_df)
190
- sentiment_df = analyse_sentiment(processed_df)
 
191
  bias_df = add_bias_annotation(sentiment_df)
192
  extremity_df = set_article_extremity(bias_df)
193
  final_df = add_article_summaries(extremity_df)
 
194
 
195
  if __name__ == "__main__":
196
  main()
 
12
 
13
  from dotenv import load_dotenv
14
  import os
15
+ from concurrent.futures import ThreadPoolExecutor
16
 
17
  load_dotenv()
18
 
 
54
  "unknown": 0
55
  }
56
 
57
+ def query(topic, sort_by="popularity", max_tokens=100):
58
+ if not topic:
59
+ print("Topic must be provided.")
60
+ return None
61
 
 
 
 
 
62
  today = datetime.today()
63
+ last_week = today - timedelta(days=7)
64
+ from_date = last_week.strftime('%Y-%m-%d')
65
  to_date = today.strftime('%Y-%m-%d')
66
+
67
  base_url = "https://newsapi.org/v2/everything"
68
+ url = (
69
+ f"{base_url}?q={topic}&from={from_date}&to={to_date}"
70
+ f"&sortBy={sort_by}&pageSize=20&apiKey={api_key}"
71
+ )
72
 
73
  try:
74
+ response = requests.get(url, timeout=10)
75
+ if response.status_code != 200:
76
+ print(f"API returned error: {response.status_code}")
77
+ return None
78
+
79
+ data = response.json()
80
+
81
+ if data.get("totalResults", 0) == 0:
82
+ print("No articles found for the given query and date range.")
83
+ return None
84
+
85
+ articles = data.get("articles", [])
86
+ extracted = [
87
+ {
88
+ "title": article.get("title", "N/A"),
89
+ "description": article.get("description", "N/A"),
90
+ "source_name": article.get("source", {}).get("name", "N/A"),
91
+ "url": article.get("url", "N/A"),
92
+ "publishedAt": article.get("publishedAt", "N/A"),
93
+ }
94
+ for article in articles
95
+ ]
96
+
97
+ return pd.DataFrame(extracted)
98
+
99
+ except Exception as e:
100
+ print(f"An error occurred: {e}")
101
+ return None
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  def process_data(df):
 
111
  return df_cleaned
112
 
113
  def analyse_sentiment(df):
 
114
  analyser = SentimentIntensityAnalyzer()
115
+
116
  df['compound'] = [analyser.polarity_scores(x)['compound'] for x in df['text']]
117
  df['neg'] = [analyser.polarity_scores(x)['neg'] for x in df['text']]
118
  df['neu'] = [analyser.polarity_scores(x)['neu'] for x in df['text']]
119
  df['pos'] = [analyser.polarity_scores(x)['pos'] for x in df['text']]
120
+
121
+ def label(score):
122
  if score >= 0.05:
123
  return "positive"
124
  elif score <= -0.05:
 
126
  else:
127
  return "neutral"
128
 
129
+ df['sentiment_label'] = df['compound'].apply(label)
130
  return df
131
 
132
def get_bias_label(source_name):
    """Return the political-bias label for a news source, or "unknown".

    The lookup key is the source name lower-cased with surrounding
    whitespace removed, matching the keys of SOURCE_BIAS_MAP.
    """
    key = source_name.lower().strip()
    return SOURCE_BIAS_MAP.get(key, "unknown")
135
 
136
def add_bias_annotation(df):
    """Add a 'bias_label' column mapped from each row's 'source_name'.

    Source names are normalised (stripped, lower-cased) before lookup in
    SOURCE_BIAS_MAP; unmatched sources are labelled "unknown".
    """
    lookup = pd.Series(SOURCE_BIAS_MAP)
    normalised = df['source_name'].str.strip().str.lower()
    df['bias_label'] = normalised.map(lookup).fillna("unknown")
    return df
140
 
141
  def set_article_extremity(df, top_n=5):
 
161
  source_name = row['source_name'] if 'source_name' in row and pd.notna(row['source_name']) else 'unknown'
162
 
163
  input_length = len(text.split())
164
+ max_length = min(input_length - 10, max_tokens)
 
 
 
 
165
  min_length = max(10, max_length - 10)
166
 
167
  summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
168
  summary_text = summary[0]['summary_text']
 
169
  bias_label = get_bias_label(source_name)
170
 
171
+ return pd.Series({'summary': summary_text, 'bias_score': bias_label, 'source': source_name})
 
 
 
 
172
 
173
  except Exception as e:
174
  print(f"Error summarising row: {e}")
175
+ return pd.Series({'summary': 'Summary unavailable', 'bias_score': 'unknown', 'source': 'unknown'})
 
 
 
 
176
 
177
  def add_article_summaries(df, max_tokens=512):
178
  summary_df = df.apply(summarise_text, axis=1, max_tokens=max_tokens)
 
181
 
182
def main():
    """Run the full pipeline: fetch, clean, analyse, annotate, summarise."""
    raw_df = query("Tesla")
    if raw_df is None or raw_df.empty:
        print("No data found!")
        return

    processed_df = process_data(raw_df)
    # analyse_sentiment takes a single DataFrame argument and constructs its
    # own SentimentIntensityAnalyzer internally; passing an analyser here
    # raised TypeError ("takes 1 positional argument but 2 were given").
    sentiment_df = analyse_sentiment(processed_df)
    bias_df = add_bias_annotation(sentiment_df)
    extremity_df = set_article_extremity(bias_df)
    final_df = add_article_summaries(extremity_df)
    print(final_df.head())
195
 
196
  if __name__ == "__main__":
197
  main()