Sam Chaudry committed
Commit 4d234fe · 1 Parent(s): 1139f00

Initial commit with project files

Files changed (3)
  1. app.py +20 -0
  2. media_trust.py +196 -0
  3. requirements.txt +147 -0
app.py ADDED
@@ -0,0 +1,20 @@
+ import gradio as gr
+ import pandas as pd
+ from media_trust import query, process_data, analyse_sentiment, add_bias_annotation, set_article_extremity, add_article_summaries
+
+ def process_news(topic):
+     # Run the full pipeline: fetch -> clean -> sentiment -> bias -> extremity -> summaries.
+     raw_df = query(topic)
+     processed_df = process_data(raw_df)
+     sentiment_df = analyse_sentiment(processed_df)
+     bias_df = add_bias_annotation(sentiment_df)
+     extremity_df = set_article_extremity(bias_df)
+     final_df = add_article_summaries(extremity_df)
+     return final_df[['title', 'summary', 'bias_label', 'extremity_pct', 'source']]
+
+ with gr.Blocks() as interface:
+     with gr.Column():
+         topic_input = gr.Textbox(label="Enter a topic", placeholder="e.g., Tesla")
+         output_table = gr.DataFrame(headers=["Title", "Summary", "Bias", "Extremity %", "Source"], interactive=False)
+         topic_input.submit(process_news, inputs=topic_input, outputs=output_table)
+
+ interface.launch()
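
Note: app.py relies on media_trust.py, which reads a NewsAPI key from the environment at import time. A minimal local setup sketch (the key value is a placeholder, and 7860 is Gradio's usual default port):

    # .env — placeholder value, substitute a real NewsAPI key
    API_KEY=your-newsapi-key

    # then launch the app
    python app.py   # serves the Gradio UI, by default at http://127.0.0.1:7860
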
media_trust.py ADDED
@@ -0,0 +1,196 @@
+ import os
+ import requests
+ import pandas as pd
+ import nltk
+ from datetime import datetime, timedelta
+ from nltk.sentiment.vader import SentimentIntensityAnalyzer
+ from dotenv import load_dotenv
+ from transformers import pipeline
+
+ nltk.download('vader_lexicon', quiet=True)
+
+ # Load the summarisation model once at import time.
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+
+ load_dotenv()
+ api_key = os.getenv("API_KEY")
+ if not api_key:
+     raise ValueError("API_KEY not found. Make sure to set it in the .env file.")
+
+ SOURCE_BIAS_MAP = {
+     "fox news": "right",
+     "breitbart": "right",
+     "new york post": "right",
+     "the wall street journal": "center-right",
+     "reuters": "center",
+     "associated press": "center",
+     "bloomberg": "center",
+     "npr": "center-left",
+     "cnn": "left",
+     "msnbc": "left",
+     "the new york times": "left",
+     "the washington post": "left",
+     "the guardian": "left",
+     "bbc news": "center",
+     "sky news": "center-right",
+     "the telegraph": "right",
+     "the times": "center-right",
+     "daily mail": "right",
+     "the independent": "center-left",
+     "the sun": "right",
+     "financial times": "center",
+ }
+
+ BIAS_SCORE_MAP = {
+     "left": -1,
+     "center-left": -0.5,
+     "center": 0,
+     "center-right": 0.5,
+     "right": 1,
+     "unknown": 0,
+ }
+
+ def query(topic, sort_by="popularity"):
+     """Fetch recent articles about `topic` from NewsAPI and return them as a DataFrame."""
+     if not topic:
+         print("Topic needs to be passed in")
+         return pd.DataFrame()
+
+     # Restrict the search to a 20-day window ending today.
+     today = datetime.today()
+     window_start = today - timedelta(days=20)
+     from_date = window_start.strftime('%Y-%m-%d')
+     to_date = today.strftime('%Y-%m-%d')
+
+     base_url = "https://newsapi.org/v2/everything"
+     url = f"{base_url}?q={topic}&from={from_date}&to={to_date}&sortBy={sort_by}&apiKey={api_key}"
+
+     news = None
+     try:
+         news_response = requests.get(url, timeout=10)
+         if news_response.status_code == 200:
+             news = news_response.json()
+         else:
+             print("API error has occurred:", news_response.status_code)
+     except requests.RequestException as e:
+         print(f"Request failed: {e}")
+
+     # Return an empty frame instead of crashing when the request failed.
+     if news is None:
+         return pd.DataFrame()
+
+     extracted_data = []
+     for article in news.get("articles", []):
+         extracted_data.append({
+             "title": article.get("title", "N/A"),
+             "description": article.get("description", "N/A"),
+             "source_name": article.get("source", {}).get("name", "N/A"),
+             "url": article.get("url", "N/A"),
+             "publishedAt": article.get("publishedAt", "N/A"),
+         })
+
+     return pd.DataFrame(extracted_data)
+
+
+ def process_data(df):
+     """Drop empty and duplicate rows, then build a combined lower-cased text field."""
+     if df.empty:
+         return df
+     df_cleaned = df.dropna(subset=["title", "description"])
+     df_cleaned = df_cleaned[df_cleaned["title"].str.strip() != ""]
+     df_cleaned = df_cleaned[df_cleaned["description"].str.strip() != ""]
+     df_cleaned = df_cleaned.drop_duplicates(subset=["title", "url"])
+     df_cleaned["text"] = (df_cleaned["title"] + ". " + df_cleaned["description"]).str.lower()
+     return df_cleaned
+
+ def analyse_sentiment(df):
+     """Score each article's text with VADER and attach a coarse sentiment label."""
+     analyser = SentimentIntensityAnalyzer()
+
+     # Score each text once, then unpack the component scores.
+     scores = df['text'].apply(analyser.polarity_scores)
+     df['compound'] = scores.apply(lambda s: s['compound'])
+     df['neg'] = scores.apply(lambda s: s['neg'])
+     df['neu'] = scores.apply(lambda s: s['neu'])
+     df['pos'] = scores.apply(lambda s: s['pos'])
+
+     def label_sentiment(score):
+         if score >= 0.05:
+             return "positive"
+         elif score <= -0.05:
+             return "negative"
+         return "neutral"
+
+     df['sentiment_label'] = df['compound'].apply(label_sentiment)
+     return df
+
+ def get_bias_label(source_name):
+     source = source_name.strip().lower()
+     return SOURCE_BIAS_MAP.get(source, "unknown")
+
+ def add_bias_annotation(df):
+     df['bias_label'] = df['source_name'].apply(get_bias_label)
+     return df
+
+ def set_article_extremity(df, top_n=5):
+     """Combine sentiment strength and distance from the political center into an extremity score."""
+     df['bias_score'] = df['bias_label'].map(lambda label: BIAS_SCORE_MAP.get(label, 0))
+
+     # Both terms lie in [0, 1], so the raw score lies in [0, 2]; scale it to a percentage.
+     df['extremity_score'] = df['compound'].abs() + df['bias_score'].abs()
+     df['extremity_pct'] = ((df['extremity_score'] / 2) * 100).round(1)
+
+     # Flag the top_n most extreme articles.
+     df = df.sort_values(by='extremity_score', ascending=False)
+     df['extreme'] = False
+     df.loc[df.index[:top_n], 'extreme'] = True
+
+     return df
+
+ def summarise_text(row, max_tokens=512):
+     """Summarise one article's text; fall back to a placeholder on any error."""
+     try:
+         text = row['text'] if 'text' in row and pd.notna(row['text']) else ''
+         source_name = row['source_name'] if 'source_name' in row and pd.notna(row['source_name']) else 'unknown'
+
+         # Pick summary bounds relative to the input length so short texts still work.
+         input_length = len(text.split())
+         if input_length < 40:
+             max_length = max(10, input_length // 2)
+         else:
+             max_length = min(input_length - 10, max_tokens)
+         min_length = max(10, max_length - 10)
+
+         summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
+         summary_text = summary[0]['summary_text']
+
+         return pd.Series({
+             'summary': summary_text,
+             'source': source_name,
+         })
+
+     except Exception as e:
+         print(f"Error summarising row: {e}")
+         return pd.Series({
+             'summary': 'Summary unavailable',
+             'source': 'unknown',
+         })
+
+ def add_article_summaries(df, max_tokens=512):
+     # Note: the bias label and numeric bias score are already set upstream by
+     # add_bias_annotation and set_article_extremity, so only summary and source
+     # are produced here (avoiding the earlier clash where the numeric bias_score
+     # column was overwritten with a label string).
+     summary_df = df.apply(summarise_text, axis=1, max_tokens=max_tokens)
+     df[['summary', 'source']] = summary_df
+     return df
+
+ def main():
+     raw_df = query("Tesla")
+     processed_df = process_data(raw_df)
+     sentiment_df = analyse_sentiment(processed_df)
+     bias_df = add_bias_annotation(sentiment_df)
+     extremity_df = set_article_extremity(bias_df)
+     final_df = add_article_summaries(extremity_df)
+     print(final_df[['title', 'summary', 'bias_label', 'extremity_pct', 'source']].head())
+
+ if __name__ == "__main__":
+     main()
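
As a sanity check on the scoring in set_article_extremity, a worked example with illustrative values (not real API output): an article whose source maps to "right" (bias_score 1) and whose VADER compound is -0.8 gets

    extremity_score = abs(-0.8) + abs(1)          # 0.8 + 1 = 1.8
    extremity_pct = round((1.8 / 2) * 100, 1)     # 90.0, near the top of the 0-2 range
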
requirements.txt ADDED
@@ -0,0 +1,147 @@
+ aiofiles==24.1.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ appnope==0.1.4
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==3.0.0
+ async-lru==2.0.4
+ attrs==25.1.0
+ babel==2.17.0
+ beautifulsoup4==4.13.3
+ bleach==6.2.0
+ certifi==2025.4.26
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ click==8.2.0
+ comm==0.2.2
+ debugpy==1.8.13
+ decorator==5.2.1
+ defusedxml==0.7.1
+ executing==2.2.0
+ fastapi==0.115.12
+ fastjsonschema==2.21.1
+ ffmpy==0.5.0
+ filelock==3.18.0
+ fqdn==1.5.1
+ fsspec==2025.3.2
+ gradio==5.29.0
+ gradio_client==1.10.0
+ groovy==0.1.2
+ h11==0.16.0
+ hf-xet==1.1.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.31.1
+ idna==3.10
+ ipykernel==6.29.5
+ ipython==9.0.2
+ ipython_pygments_lexers==1.1.1
+ ipywidgets==8.1.5
+ isoduration==20.11.0
+ jedi==0.19.2
+ Jinja2==3.1.6
+ joblib==1.5.0
+ json5==0.10.0
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ jupyter==1.1.1
+ jupyter-console==6.6.3
+ jupyter-events==0.12.0
+ jupyter-lsp==2.2.5
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ jupyter_server==2.15.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.3.5
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.13
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mistune==3.1.2
+ mpmath==1.3.0
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.4.2
+ nltk==3.9.1
+ notebook==7.3.2
+ notebook_shim==0.2.4
+ numpy==2.2.5
+ orjson==3.10.18
+ overrides==7.7.0
+ packaging==25.0
+ pandas==2.2.3
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==11.2.1
+ platformdirs==4.3.6
+ prometheus_client==0.21.1
+ prompt_toolkit==3.0.50
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pycparser==2.22
+ pydantic==2.11.4
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.0
+ python-json-logger==3.3.0
+ python-multipart==0.0.20
+ pytz==2025.2
+ PyYAML==6.0.2
+ pyzmq==26.2.1
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==14.0.0
+ rpds-py==0.23.1
+ ruff==0.11.9
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.15.2
+ semantic-version==2.10.0
+ Send2Trash==1.8.3
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ soupsieve==2.6
+ stack-data==0.6.3
+ starlette==0.46.2
+ sympy==1.14.0
+ terminado==0.18.1
+ threadpoolctl==3.5.0
+ tinycss2==1.4.0
+ tokenizers==0.21.1
+ tomlkit==0.13.2
+ torch==2.7.0
+ tornado==6.4.2
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.51.3
+ typer==0.15.3
+ types-python-dateutil==2.9.0.20241206
+ typing-inspection==0.4.0
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ uri-template==1.3.0
+ urllib3==2.4.0
+ uvicorn==0.34.2
+ wcwidth==0.2.13
+ webcolors==24.11.1
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==15.0.1
+ widgetsnbextension==4.0.13