Spaces:
Running
Running
Sam Chaudry
committed on
Commit
·
2228634
1
Parent(s):
addef69
Optimisations
Browse files- media_trust.py +58 -57
media_trust.py
CHANGED
@@ -12,6 +12,7 @@ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
|
|
12 |
|
13 |
from dotenv import load_dotenv
|
14 |
import os
|
|
|
15 |
|
16 |
load_dotenv()
|
17 |
|
@@ -53,45 +54,52 @@ BIAS_SCORE_MAP = {
|
|
53 |
"unknown": 0
|
54 |
}
|
55 |
|
56 |
-
def query(
|
|
|
|
|
|
|
57 |
|
58 |
-
if query == "":
|
59 |
-
print("Topic needs to be passed in")
|
60 |
-
return
|
61 |
-
|
62 |
today = datetime.today()
|
63 |
-
|
64 |
-
from_date =
|
65 |
to_date = today.strftime('%Y-%m-%d')
|
66 |
-
|
67 |
base_url = "https://newsapi.org/v2/everything"
|
68 |
-
url =
|
69 |
-
|
|
|
|
|
70 |
|
71 |
try:
|
72 |
-
|
73 |
-
if
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
else:
|
77 |
-
print("API error has occured", news_response.status_code)
|
78 |
-
except Exception:
|
79 |
-
print('An exception occurred')
|
80 |
-
|
81 |
-
article_arr = news["articles"]
|
82 |
-
extracted_data = []
|
83 |
-
|
84 |
-
for article in article_arr:
|
85 |
-
extracted_data.append({
|
86 |
-
"title": article.get("title", "N/A"),
|
87 |
-
"description": article.get("description", "N/A"),
|
88 |
-
"source_name": article.get("source", {}).get("name", "N/A"),
|
89 |
-
"url": article.get("url", "N/A"),
|
90 |
-
"publishedAt": article.get("publishedAt", "N/A")
|
91 |
-
})
|
92 |
-
|
93 |
-
df = pd.DataFrame(extracted_data)
|
94 |
-
return df
|
95 |
|
96 |
|
97 |
def process_data(df):
|
@@ -103,15 +111,14 @@ def process_data(df):
|
|
103 |
return df_cleaned
|
104 |
|
105 |
def analyse_sentiment(df):
|
106 |
-
|
107 |
analyser = SentimentIntensityAnalyzer()
|
108 |
-
|
109 |
df['compound'] = [analyser.polarity_scores(x)['compound'] for x in df['text']]
|
110 |
df['neg'] = [analyser.polarity_scores(x)['neg'] for x in df['text']]
|
111 |
df['neu'] = [analyser.polarity_scores(x)['neu'] for x in df['text']]
|
112 |
df['pos'] = [analyser.polarity_scores(x)['pos'] for x in df['text']]
|
113 |
-
|
114 |
-
def
|
115 |
if score >= 0.05:
|
116 |
return "positive"
|
117 |
elif score <= -0.05:
|
@@ -119,15 +126,16 @@ def analyse_sentiment(df):
|
|
119 |
else:
|
120 |
return "neutral"
|
121 |
|
122 |
-
df['sentiment_label'] = df['compound'].apply(
|
123 |
return df
|
124 |
|
125 |
def get_bias_label(source_name):
|
126 |
-
|
127 |
-
|
128 |
|
129 |
def add_bias_annotation(df):
|
130 |
-
|
|
|
131 |
return df
|
132 |
|
133 |
def set_article_extremity(df, top_n=5):
|
@@ -153,31 +161,18 @@ def summarise_text(row, max_tokens=512):
|
|
153 |
source_name = row['source_name'] if 'source_name' in row and pd.notna(row['source_name']) else 'unknown'
|
154 |
|
155 |
input_length = len(text.split())
|
156 |
-
|
157 |
-
if input_length < 40:
|
158 |
-
max_length = max(10, int(input_length / 2))
|
159 |
-
else:
|
160 |
-
max_length = min(input_length - 10, max_tokens)
|
161 |
min_length = max(10, max_length - 10)
|
162 |
|
163 |
summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
|
164 |
summary_text = summary[0]['summary_text']
|
165 |
-
|
166 |
bias_label = get_bias_label(source_name)
|
167 |
|
168 |
-
return pd.Series({
|
169 |
-
'summary': summary_text,
|
170 |
-
'bias_score': bias_label,
|
171 |
-
'source': source_name
|
172 |
-
})
|
173 |
|
174 |
except Exception as e:
|
175 |
print(f"Error summarising row: {e}")
|
176 |
-
return pd.Series({
|
177 |
-
'summary': 'Summary unavailable',
|
178 |
-
'bias_score': 'unknown',
|
179 |
-
'source': 'unknown'
|
180 |
-
})
|
181 |
|
182 |
def add_article_summaries(df, max_tokens=512):
|
183 |
summary_df = df.apply(summarise_text, axis=1, max_tokens=max_tokens)
|
@@ -186,11 +181,17 @@ def add_article_summaries(df, max_tokens=512):
|
|
186 |
|
187 |
def main():
|
188 |
raw_df = query("Tesla")
|
|
|
|
|
|
|
|
|
189 |
processed_df = process_data(raw_df)
|
190 |
-
|
|
|
191 |
bias_df = add_bias_annotation(sentiment_df)
|
192 |
extremity_df = set_article_extremity(bias_df)
|
193 |
final_df = add_article_summaries(extremity_df)
|
|
|
194 |
|
195 |
if __name__ == "__main__":
|
196 |
main()
|
|
|
12 |
|
13 |
from dotenv import load_dotenv
|
14 |
import os
|
15 |
+
from concurrent.futures import ThreadPoolExecutor
|
16 |
|
17 |
load_dotenv()
|
18 |
|
|
|
54 |
"unknown": 0
|
55 |
}
|
56 |
|
57 |
+
def query(topic, sort_by="popularity", max_tokens=100):
    """Fetch the last seven days of NewsAPI articles about *topic*.

    Args:
        topic: Search phrase sent as the ``q`` parameter. Falsy values
            (``""``, ``None``) are rejected before any request is made.
        sort_by: Value forwarded as NewsAPI's ``sortBy`` parameter.
        max_tokens: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``description``,
        ``source_name``, ``url`` and ``publishedAt`` (one row per article),
        or ``None`` when the topic is missing, the API answers with a
        non-200 status, no articles are reported, or an error occurs.
    """
    if not topic:
        print("Topic must be provided.")
        return None

    today = datetime.today()
    last_week = today - timedelta(days=7)

    base_url = "https://newsapi.org/v2/everything"
    # Let requests build the query string so topics containing spaces, '&',
    # '#', etc. are percent-encoded instead of corrupting the URL.
    params = {
        "q": topic,
        "from": last_week.strftime('%Y-%m-%d'),
        "to": today.strftime('%Y-%m-%d'),
        "sortBy": sort_by,
        "pageSize": 20,
        "apiKey": api_key,
    }

    try:
        response = requests.get(base_url, params=params, timeout=10)
        if response.status_code != 200:
            print(f"API returned error: {response.status_code}")
            return None

        data = response.json()

        if data.get("totalResults", 0) == 0:
            print("No articles found for the given query and date range.")
            return None

        articles = data.get("articles", [])
        extracted = [
            {
                "title": article.get("title", "N/A"),
                "description": article.get("description", "N/A"),
                # "source" may be present but null; fall back to an empty
                # dict so one odd article does not abort the whole fetch.
                "source_name": (article.get("source") or {}).get("name", "N/A"),
                "url": article.get("url", "N/A"),
                "publishedAt": article.get("publishedAt", "N/A"),
            }
            for article in articles
        ]

        return pd.DataFrame(extracted)

    except Exception as e:
        # Best-effort boundary: network, JSON and payload-shape problems are
        # reported and turned into a None result for the caller to check.
        print(f"An error occurred: {e}")
        return None
|
102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
|
105 |
def process_data(df):
|
|
|
111 |
return df_cleaned
|
112 |
|
113 |
def analyse_sentiment(df):
|
|
|
114 |
analyser = SentimentIntensityAnalyzer()
|
115 |
+
|
116 |
df['compound'] = [analyser.polarity_scores(x)['compound'] for x in df['text']]
|
117 |
df['neg'] = [analyser.polarity_scores(x)['neg'] for x in df['text']]
|
118 |
df['neu'] = [analyser.polarity_scores(x)['neu'] for x in df['text']]
|
119 |
df['pos'] = [analyser.polarity_scores(x)['pos'] for x in df['text']]
|
120 |
+
|
121 |
+
def label(score):
|
122 |
if score >= 0.05:
|
123 |
return "positive"
|
124 |
elif score <= -0.05:
|
|
|
126 |
else:
|
127 |
return "neutral"
|
128 |
|
129 |
+
df['sentiment_label'] = df['compound'].apply(label)
|
130 |
return df
|
131 |
|
132 |
def get_bias_label(source_name):
    """Return the bias label registered for a news source.

    The name is stripped and lower-cased before being looked up in
    ``SOURCE_BIAS_MAP``.

    Args:
        source_name: Publisher name. Non-string values (``None``, NaN from a
            DataFrame cell, ...) are treated as an unknown source.

    Returns:
        The mapped label, or ``"unknown"`` when the source is not in the map
        or is not a string.
    """
    # Missing names arrive as None/NaN; calling .strip() on them would raise.
    if not isinstance(source_name, str):
        return "unknown"
    source = source_name.strip().lower()
    return SOURCE_BIAS_MAP.get(source, "unknown")
|
135 |
|
136 |
def add_bias_annotation(df):
    """Add a ``bias_label`` column derived from each row's ``source_name``.

    Source names are stripped and lower-cased, then looked up in
    ``SOURCE_BIAS_MAP``; names without an entry get ``"unknown"``.

    Args:
        df: DataFrame with a string ``source_name`` column. It is modified
            in place.

    Returns:
        The same DataFrame object, now carrying ``bias_label``.
    """
    lookup = pd.Series(SOURCE_BIAS_MAP)
    normalised_names = df['source_name'].str.strip().str.lower()
    labels = normalised_names.map(lookup)
    df['bias_label'] = labels.fillna("unknown")
    return df
|
140 |
|
141 |
def set_article_extremity(df, top_n=5):
|
|
|
161 |
source_name = row['source_name'] if 'source_name' in row and pd.notna(row['source_name']) else 'unknown'
|
162 |
|
163 |
input_length = len(text.split())
|
164 |
+
max_length = min(input_length - 10, max_tokens)
|
|
|
|
|
|
|
|
|
165 |
min_length = max(10, max_length - 10)
|
166 |
|
167 |
summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
|
168 |
summary_text = summary[0]['summary_text']
|
|
|
169 |
bias_label = get_bias_label(source_name)
|
170 |
|
171 |
+
return pd.Series({'summary': summary_text, 'bias_score': bias_label, 'source': source_name})
|
|
|
|
|
|
|
|
|
172 |
|
173 |
except Exception as e:
|
174 |
print(f"Error summarising row: {e}")
|
175 |
+
return pd.Series({'summary': 'Summary unavailable', 'bias_score': 'unknown', 'source': 'unknown'})
|
|
|
|
|
|
|
|
|
176 |
|
177 |
def add_article_summaries(df, max_tokens=512):
|
178 |
summary_df = df.apply(summarise_text, axis=1, max_tokens=max_tokens)
|
|
|
181 |
|
182 |
def main():
    """Run the full pipeline for the topic "Tesla" and print a preview.

    Steps: fetch articles, clean them, score sentiment, annotate source
    bias, flag extremity and add summaries. Stops early with a message when
    the fetch yields no data.
    """
    raw_df = query("Tesla")
    if raw_df is None or raw_df.empty:
        print("No data found!")
        return

    processed_df = process_data(raw_df)
    # analyse_sentiment(df) accepts only the frame and creates its own
    # SentimentIntensityAnalyzer; passing a second argument raised TypeError.
    sentiment_df = analyse_sentiment(processed_df)
    bias_df = add_bias_annotation(sentiment_df)
    extremity_df = set_article_extremity(bias_df)
    final_df = add_article_summaries(extremity_df)
    print(final_df.head())
|
195 |
|
196 |
if __name__ == "__main__":
|
197 |
main()
|