Spaces:
Running
Running
Sam Chaudry
committed on
Commit
·
4d234fe
1
Parent(s):
1139f00
Initial commit with project files
Browse files- app.py +20 -0
- media_trust.py +196 -0
- requirements.txt +147 -0
app.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from media_trust import query, process_data, analyse_sentiment, add_bias_annotation, set_article_extremity, add_article_summaries
|
4 |
+
|
5 |
+
def process_news(topic):
    """Run the media-trust pipeline for a topic and return the display columns.

    The topic is fetched with ``query`` and then passed through
    ``process_data``, ``analyse_sentiment``, ``add_bias_annotation``,
    ``set_article_extremity`` and ``add_article_summaries`` in that order.

    Args:
        topic: Search topic typed by the user.

    Returns:
        A DataFrame with the columns ``title``, ``summary``, ``bias_score``,
        ``extremity_pct`` and ``source``. It is empty when the topic is blank
        or when no usable articles remain after fetching and cleaning.
    """
    display_columns = ['title', 'summary', 'bias_score', 'extremity_pct', 'source']

    # query() returns nothing for a blank topic, so stop before calling it.
    if not topic or not topic.strip():
        return pd.DataFrame(columns=display_columns)

    raw_df = query(topic)
    if raw_df is None or raw_df.empty:
        return pd.DataFrame(columns=display_columns)

    processed_df = process_data(raw_df)
    # Cleaning can discard every row; the later stages expect at least one.
    if processed_df.empty:
        return pd.DataFrame(columns=display_columns)

    sentiment_df = analyse_sentiment(processed_df)
    bias_df = add_bias_annotation(sentiment_df)
    extremity_df = set_article_extremity(bias_df)
    final_df = add_article_summaries(extremity_df)
    return final_df[display_columns]
|
13 |
+
|
14 |
+
# Build the page: a topic textbox above a read-only results table.
interface = gr.Blocks()

with interface:
    with gr.Column():
        topic_input = gr.Textbox(
            label="Enter a topic",
            placeholder="e.g., Tesla",
        )
        output_table = gr.DataFrame(
            headers=["Title", "Summary", "Bias", "Extremity %", "Source"],
            interactive=False,
        )
        # Submitting the textbox runs the pipeline and fills the table.
        topic_input.submit(process_news, inputs=topic_input, outputs=output_table)

interface.launch()
|
media_trust.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import pandas as pd
|
3 |
+
import gradio as gr
|
4 |
+
import datetime
|
5 |
+
import nltk
|
6 |
+
from datetime import datetime, timedelta
|
7 |
+
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
8 |
+
# Fetch the VADER lexicon that SentimentIntensityAnalyzer needs.
nltk.download("vader_lexicon")

from transformers import pipeline

# Module-level summarisation pipeline, built once and reused by summarise_text().
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
|
12 |
+
|
13 |
+
from dotenv import load_dotenv
|
14 |
+
import os
|
15 |
+
|
16 |
+
# Load variables from a .env file into the process environment.
load_dotenv()

# Key sent to newsapi.org by query(); the module refuses to import without it.
api_key = os.environ.get("API_KEY")

if not api_key:
    raise ValueError("API_KEY not found. Make sure to set it in the .env file.")
|
22 |
+
|
23 |
+
# Bias label for each known outlet, keyed by the lower-cased outlet name
# (get_bias_label() lower-cases and strips a name before looking it up).
SOURCE_BIAS_MAP = dict([
    ("fox news", "right"),
    ("breitbart", "right"),
    ("new york post", "right"),
    ("the wall street journal", "center-right"),
    ("reuters", "center"),
    ("associated press", "center"),
    ("bloomberg", "center"),
    ("npr", "center-left"),
    ("cnn", "left"),
    ("msnbc", "left"),
    ("the new york times", "left"),
    ("the washington post", "left"),
    ("the guardian", "left"),
    ("bbc news", "center"),
    ("sky news", "center-right"),
    ("the telegraph", "right"),
    ("the times", "center-right"),
    ("daily mail", "right"),
    ("the independent", "center-left"),
    ("the sun", "right"),
    ("financial times", "center"),
])
|
46 |
+
|
47 |
+
# Numeric score for each bias label: negative for left, positive for right,
# zero for center and for sources with no known label.
BIAS_SCORE_MAP = dict(zip(
    ("left", "center-left", "center", "center-right", "right", "unknown"),
    (-1, -0.5, 0, 0.5, 1, 0),
))
|
55 |
+
|
56 |
+
def query(query, sort_by="popularity", max_tokens=100, days_back=20):
    """Fetch recent articles about a topic from the NewsAPI ``everything`` endpoint.

    Args:
        query: Topic to search for. The parameter shadows this function's own
            name inside the body; it is kept so keyword callers keep working.
        sort_by: Value sent as the ``sortBy`` request parameter.
        max_tokens: Not used by this function; retained for compatibility.
        days_back: Length of the search window in days, counted back from
            today.

    Returns:
        A DataFrame with the columns ``title``, ``description``,
        ``source_name``, ``url`` and ``publishedAt``, one row per article.
        The frame is empty (but still has those columns) when the topic is
        blank, the request fails, or the API answers with a non-200 status.
    """
    columns = ["title", "description", "source_name", "url", "publishedAt"]

    if not query or not query.strip():
        print("Topic needs to be passed in")
        return pd.DataFrame(columns=columns)

    today = datetime.today()
    window_start = today - timedelta(days=days_back)

    # Passing the values as params lets requests URL-encode them, so topics
    # containing spaces, '&' or '#' no longer corrupt the query string.
    params = {
        "q": query,
        "from": window_start.strftime('%Y-%m-%d'),
        "to": today.strftime('%Y-%m-%d'),
        "sortBy": sort_by,
        "apiKey": api_key,
    }

    try:
        news_response = requests.get(
            "https://newsapi.org/v2/everything", params=params, timeout=10
        )
        if news_response.status_code != 200:
            print("API error has occured", news_response.status_code)
            return pd.DataFrame(columns=columns)
        news = news_response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a response body that is not valid JSON.
        print(f"An exception occurred: {exc}")
        return pd.DataFrame(columns=columns)

    articles = news.get("articles") if isinstance(news, dict) else None

    extracted_data = []
    for article in articles or []:
        # "source" may be present but null, which would break a chained .get().
        source = article.get("source") or {}
        extracted_data.append({
            "title": article.get("title", "N/A"),
            "description": article.get("description", "N/A"),
            "source_name": source.get("name", "N/A"),
            "url": article.get("url", "N/A"),
            "publishedAt": article.get("publishedAt", "N/A"),
        })

    return pd.DataFrame(extracted_data, columns=columns)
|
95 |
+
|
96 |
+
|
97 |
+
def process_data(df):
    """Clean the raw article frame and add a combined ``text`` column.

    Rows are dropped when ``title`` or ``description`` is missing or blank,
    and when the ``title``/``url`` pair repeats an earlier row. ``text`` is
    the title, a space, and the lower-cased description.

    Args:
        df: Frame with at least ``title``, ``description`` and ``url``
            columns, or ``None``.

    Returns:
        A new DataFrame; the input is not modified. When ``df`` is ``None``
        or empty the result is an empty frame carrying the article columns
        plus ``text``.
    """
    if df is None or df.empty:
        return pd.DataFrame(
            columns=["title", "description", "source_name", "url", "publishedAt", "text"]
        )

    cleaned = df.dropna(subset=["title", "description"])
    has_title = cleaned["title"].str.strip() != ""
    has_description = cleaned["description"].str.strip() != ""
    cleaned = cleaned[has_title & has_description]

    # Copy so that adding a column never writes into a view of the input.
    cleaned = cleaned.drop_duplicates(subset=["title", "url"]).copy()

    # The separating space stops the title's last word from fusing with the
    # description's first word. Only the description is lower-cased, as before.
    cleaned["text"] = cleaned["title"] + " " + cleaned["description"].str.lower()
    return cleaned
|
104 |
+
|
105 |
+
def analyse_sentiment(df):
    """Add VADER sentiment scores and a sentiment label to each row.

    The ``text`` column is scored with NLTK's ``SentimentIntensityAnalyzer``.
    The columns ``compound``, ``neg``, ``neu``, ``pos`` and
    ``sentiment_label`` are written onto ``df`` itself.

    Args:
        df: Frame with a ``text`` column.

    Returns:
        The same DataFrame object, with the sentiment columns added.
    """
    analyser = SentimentIntensityAnalyzer()

    # Score each text once and reuse the result for all four columns.
    scores = [analyser.polarity_scores(text) for text in df['text']]
    for key in ('compound', 'neg', 'neu', 'pos'):
        df[key] = [score[key] for score in scores]

    def label_sentiment(score):
        """Map a compound score to positive / negative / neutral."""
        if score >= 0.05:
            return "positive"
        elif score <= -0.05:
            return "negative"
        else:
            return "neutral"

    df['sentiment_label'] = df['compound'].apply(label_sentiment)
    return df
|
124 |
+
|
125 |
+
def get_bias_label(source_name):
    """Return the bias label for a news source name.

    The name is stripped and lower-cased before it is looked up in
    ``SOURCE_BIAS_MAP``.

    Args:
        source_name: Outlet name as reported for the article.

    Returns:
        The mapped bias label, or ``"unknown"`` when the source is not in
        the map or the value is not a string (for example ``None`` or NaN).
    """
    # A missing source arrives as None/NaN, which has no .strip().
    if not isinstance(source_name, str):
        return "unknown"
    source = source_name.strip().lower()
    return SOURCE_BIAS_MAP.get(source, "unknown")
|
128 |
+
|
129 |
+
def add_bias_annotation(df):
    """Add a ``bias_label`` column derived from each row's ``source_name``.

    The column is written onto ``df`` itself, and that same object is
    returned.
    """
    labels = df['source_name'].map(get_bias_label)
    df['bias_label'] = labels
    return df
|
132 |
+
|
133 |
+
def set_article_extremity(df, top_n=5):
    """Score how extreme each article is and flag the most extreme rows.

    ``bias_score`` is the ``BIAS_SCORE_MAP`` value for the row's
    ``bias_label`` (0 when the label is not in the map). ``extremity_score``
    is ``abs(compound) + abs(bias_score)`` and ``extremity_pct`` is that
    score divided by 2, times 100, rounded to one decimal. These three
    columns are written onto ``df`` itself.

    Args:
        df: Frame with ``bias_label`` and ``compound`` columns.
        top_n: Number of leading rows, after sorting, to flag as extreme.

    Returns:
        A new DataFrame sorted by ``extremity_score`` in descending order,
        with a boolean ``extreme`` column that is True for the first
        ``top_n`` rows.
    """
    def get_bias_extremity(label):
        return BIAS_SCORE_MAP.get(label, 0)

    df['bias_score'] = df['bias_label'].apply(get_bias_extremity)
    df['extremity_score'] = df['compound'].abs() + df['bias_score'].abs()
    df['extremity_pct'] = ((df['extremity_score'] / 2) * 100).round(1)

    ranked = df.sort_values(by='extremity_score', ascending=False)
    ranked['extreme'] = False
    # Flag by position: label-based .loc would also mark any other row that
    # shares an index label with one of the top rows.
    ranked.iloc[:top_n, ranked.columns.get_loc('extreme')] = True

    return ranked
|
149 |
+
|
150 |
+
def summarise_text(row, max_tokens=512):
    """Summarise one article row with the module-level ``summarizer``.

    Args:
        row: Mapping-like row; ``text`` and ``source_name`` are read when
            present and not null.
        max_tokens: Upper bound for the summary's ``max_length`` when the
            text has 40 or more whitespace-separated words.

    Returns:
        A Series with ``summary``, ``bias_score`` and ``source``. Note that
        ``bias_score`` holds the bias *label* string for the source. If
        anything raises, the error is printed and the Series contains
        ``'Summary unavailable'``, ``'unknown'`` and ``'unknown'``.
    """
    try:
        text = ''
        if 'text' in row and pd.notna(row['text']):
            text = row['text']

        source_name = 'unknown'
        if 'source_name' in row and pd.notna(row['source_name']):
            source_name = row['source_name']

        # Length limits are derived from the word count of the input.
        word_count = len(text.split())
        if word_count >= 40:
            longest = min(word_count - 10, max_tokens)
        else:
            longest = max(10, int(word_count / 2))
        shortest = max(10, longest - 10)

        result = summarizer(text, max_length=longest, min_length=shortest, do_sample=False)

        return pd.Series({
            'summary': result[0]['summary_text'],
            'bias_score': get_bias_label(source_name),
            'source': source_name,
        })

    except Exception as err:
        print(f"Error summarising row: {err}")
        fallback = {
            'summary': 'Summary unavailable',
            'bias_score': 'unknown',
            'source': 'unknown',
        }
        return pd.Series(fallback)
|
181 |
+
|
182 |
+
def add_article_summaries(df, max_tokens=512):
    """Add ``summary``, ``bias_score`` and ``source`` columns to ``df``.

    Each row is passed to ``summarise_text``. The resulting ``bias_score``
    column holds bias label strings and replaces any existing column of that
    name. The columns are written onto ``df`` itself.

    Args:
        df: Article frame; rows are forwarded to ``summarise_text``.
        max_tokens: Forwarded to ``summarise_text``.

    Returns:
        The same DataFrame object with the three columns set.
    """
    summary_columns = ['summary', 'bias_score', 'source']

    # With no rows, apply() returns a frame without these three columns and
    # the assignment below would raise, so add empty columns directly.
    if df.empty:
        for column in summary_columns:
            df[column] = pd.Series(index=df.index, dtype=object)
        return df

    summary_df = df.apply(summarise_text, axis=1, max_tokens=max_tokens)
    df[summary_columns] = summary_df
    return df
|
186 |
+
|
187 |
+
def main():
    """Run the whole pipeline once for the topic "Tesla".

    The resulting frame is not returned or printed.
    """
    stages = (
        process_data,
        analyse_sentiment,
        add_bias_annotation,
        set_article_extremity,
        add_article_summaries,
    )
    frame = query("Tesla")
    # Each stage takes the previous stage's frame and returns the next one.
    for stage in stages:
        frame = stage(frame)


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==24.1.0
|
2 |
+
annotated-types==0.7.0
|
3 |
+
anyio==4.9.0
|
4 |
+
appnope==0.1.4
|
5 |
+
argon2-cffi==23.1.0
|
6 |
+
argon2-cffi-bindings==21.2.0
|
7 |
+
arrow==1.3.0
|
8 |
+
asttokens==3.0.0
|
9 |
+
async-lru==2.0.4
|
10 |
+
attrs==25.1.0
|
11 |
+
babel==2.17.0
|
12 |
+
beautifulsoup4==4.13.3
|
13 |
+
bleach==6.2.0
|
14 |
+
certifi==2025.4.26
|
15 |
+
cffi==1.17.1
|
16 |
+
charset-normalizer==3.4.2
|
17 |
+
click==8.2.0
|
18 |
+
comm==0.2.2
|
19 |
+
debugpy==1.8.13
|
20 |
+
decorator==5.2.1
|
21 |
+
defusedxml==0.7.1
|
22 |
+
executing==2.2.0
|
23 |
+
fastapi==0.115.12
|
24 |
+
fastjsonschema==2.21.1
|
25 |
+
ffmpy==0.5.0
|
26 |
+
filelock==3.18.0
|
27 |
+
fqdn==1.5.1
|
28 |
+
fsspec==2025.3.2
|
29 |
+
gradio==5.29.0
|
30 |
+
gradio_client==1.10.0
|
31 |
+
groovy==0.1.2
|
32 |
+
h11==0.16.0
|
33 |
+
hf-xet==1.1.0
|
34 |
+
httpcore==1.0.9
|
35 |
+
httpx==0.28.1
|
36 |
+
huggingface-hub==0.31.1
|
37 |
+
idna==3.10
|
38 |
+
ipykernel==6.29.5
|
39 |
+
ipython==9.0.2
|
40 |
+
ipython_pygments_lexers==1.1.1
|
41 |
+
ipywidgets==8.1.5
|
42 |
+
isoduration==20.11.0
|
43 |
+
jedi==0.19.2
|
44 |
+
Jinja2==3.1.6
|
45 |
+
joblib==1.5.0
|
46 |
+
json5==0.10.0
|
47 |
+
jsonpointer==3.0.0
|
48 |
+
jsonschema==4.23.0
|
49 |
+
jsonschema-specifications==2024.10.1
|
50 |
+
jupyter==1.1.1
|
51 |
+
jupyter-console==6.6.3
|
52 |
+
jupyter-events==0.12.0
|
53 |
+
jupyter-lsp==2.2.5
|
54 |
+
jupyter_client==8.6.3
|
55 |
+
jupyter_core==5.7.2
|
56 |
+
jupyter_server==2.15.0
|
57 |
+
jupyter_server_terminals==0.5.3
|
58 |
+
jupyterlab==4.3.5
|
59 |
+
jupyterlab_pygments==0.3.0
|
60 |
+
jupyterlab_server==2.27.3
|
61 |
+
jupyterlab_widgets==3.0.13
|
62 |
+
markdown-it-py==3.0.0
|
63 |
+
MarkupSafe==3.0.2
|
64 |
+
matplotlib-inline==0.1.7
|
65 |
+
mdurl==0.1.2
|
66 |
+
mistune==3.1.2
|
67 |
+
mpmath==1.3.0
|
68 |
+
nbclient==0.10.2
|
69 |
+
nbconvert==7.16.6
|
70 |
+
nbformat==5.10.4
|
71 |
+
nest-asyncio==1.6.0
|
72 |
+
networkx==3.4.2
|
73 |
+
nltk==3.9.1
|
74 |
+
notebook==7.3.2
|
75 |
+
notebook_shim==0.2.4
|
76 |
+
numpy==2.2.5
|
77 |
+
orjson==3.10.18
|
78 |
+
overrides==7.7.0
|
79 |
+
packaging==25.0
|
80 |
+
pandas==2.2.3
|
81 |
+
pandocfilters==1.5.1
|
82 |
+
parso==0.8.4
|
83 |
+
pexpect==4.9.0
|
84 |
+
pillow==11.2.1
|
85 |
+
platformdirs==4.3.6
|
86 |
+
prometheus_client==0.21.1
|
87 |
+
prompt_toolkit==3.0.50
|
88 |
+
psutil==7.0.0
|
89 |
+
ptyprocess==0.7.0
|
90 |
+
pure_eval==0.2.3
|
91 |
+
pycparser==2.22
|
92 |
+
pydantic==2.11.4
|
93 |
+
pydantic_core==2.33.2
|
94 |
+
pydub==0.25.1
|
95 |
+
Pygments==2.19.1
|
96 |
+
python-dateutil==2.9.0.post0
|
97 |
+
python-dotenv==1.1.0
|
98 |
+
python-json-logger==3.3.0
|
99 |
+
python-multipart==0.0.20
|
100 |
+
pytz==2025.2
|
101 |
+
PyYAML==6.0.2
|
102 |
+
pyzmq==26.2.1
|
103 |
+
referencing==0.36.2
|
104 |
+
regex==2024.11.6
|
105 |
+
requests==2.32.3
|
106 |
+
rfc3339-validator==0.1.4
|
107 |
+
rfc3986-validator==0.1.1
|
108 |
+
rich==14.0.0
|
109 |
+
rpds-py==0.23.1
|
110 |
+
ruff==0.11.9
|
111 |
+
safehttpx==0.1.6
|
112 |
+
safetensors==0.5.3
|
113 |
+
scikit-learn==1.6.1
|
114 |
+
scipy==1.15.2
|
115 |
+
semantic-version==2.10.0
|
116 |
+
Send2Trash==1.8.3
|
117 |
+
shellingham==1.5.4
|
118 |
+
six==1.17.0
|
119 |
+
sniffio==1.3.1
|
120 |
+
soupsieve==2.6
|
121 |
+
stack-data==0.6.3
|
122 |
+
starlette==0.46.2
|
123 |
+
sympy==1.14.0
|
124 |
+
terminado==0.18.1
|
125 |
+
threadpoolctl==3.5.0
|
126 |
+
tinycss2==1.4.0
|
127 |
+
tokenizers==0.21.1
|
128 |
+
tomlkit==0.13.2
|
129 |
+
torch==2.7.0
|
130 |
+
tornado==6.4.2
|
131 |
+
tqdm==4.67.1
|
132 |
+
traitlets==5.14.3
|
133 |
+
transformers==4.51.3
|
134 |
+
typer==0.15.3
|
135 |
+
types-python-dateutil==2.9.0.20241206
|
136 |
+
typing-inspection==0.4.0
|
137 |
+
typing_extensions==4.13.2
|
138 |
+
tzdata==2025.2
|
139 |
+
uri-template==1.3.0
|
140 |
+
urllib3==2.4.0
|
141 |
+
uvicorn==0.34.2
|
142 |
+
wcwidth==0.2.13
|
143 |
+
webcolors==24.11.1
|
144 |
+
webencodings==0.5.1
|
145 |
+
websocket-client==1.8.0
|
146 |
+
websockets==15.0.1
|
147 |
+
widgetsnbextension==4.0.13
|