Spaces: Sleeping
Jason Wu committed · faa1d38 · 1 Parent(s): 636549a
finish
Browse files
app.py CHANGED
@@ -1,73 +1,250 @@
- # INSTRUCTIONS:
- # 1. Open a "Terminal" by: View --> Terminal OR just the "Terminal" through the hamburger menu
- # 2. run in terminal with: streamlit run app.py
- # 3. click the "Open in Browser" link that pops up OR click on "Ports" and copy the URL
- # 4. Open a Simple Browser with View --> Command Palette --> Simple Browser: Show
- # 5. use the URL from prior steps as input into this simple browser
-
  import streamlit as st
  import altair as alt
-
- st.text("The URL for this app is: https://huggingface.co/spaces/jwu249/is445_demo")
-
- click = alt.selection_point(encodings=["color"])
-
- # Top panel is scatter plot of temperature vs time
- points = (
-     alt.Chart()
-     .mark_point()
-     .encode(
-         alt.X("monthdate(date):T", title="Date (Month Year)"),
-         alt.Y(
-             "temp_max:Q",
-             title="Maximum Daily Temperature (C)",
-             scale=alt.Scale(domain=[-5, 40]),
-         ),
-         color=alt.condition(brush, color, alt.value("lightgray")),
-         size=alt.Size("precipitation:Q", scale=alt.Scale(range=[5, 200])),
-     )
-     .properties(width=550, height=300)
-     .add_params(brush)
-     .transform_filter(click)
- )
-
- chart = alt.vconcat(points, bars, data=source, title="Seattle Weather - 2002 to 2012")
-
- with
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import re
+ from tqdm.notebook import tqdm
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from scipy.special import softmax
+
+ # pip install torch torchvision torchaudio
+ # pip install transformers
+
+ st.title("Final Project Part 2 - jwu249 | Expert Visualizations")
+
+ url = "https://www.kaggle.com/datasets/rahulgoel1106/xenophobia-on-twitter-during-covid19"
+ st.write("Dataset Link to Download -> [Kaggle Covid-19 Xenophobic Dataset](%s)" % url)
+
+ plt.style.use('ggplot')
+
+ multi = '''The dataset chosen is called Xenophobia and, as the name suggests, it highlights xenophobic posts on Twitter
+ during the beginning stages of Covid-19; today we are conducting sentiment analysis on them using a trained Twitter sentiment model.
+ #### To follow: '''
+ st.markdown('''### About:''')
+ st.markdown(multi)
+ st.code('''# pip install these packages into your terminal or workspace
+ pip install torch torchvision torchaudio # to work with the trained model
+ pip install transformers # to work with the trained model''')
+
+ df = pd.read_csv('Xenophobia.csv', encoding='latin1', nrows=5000)
+ cols_to_drop = ['status_id', 'created_at', 'location']
+ df.drop(cols_to_drop, axis=1, inplace=True)
+
+ # Convert text to string type
+ df['text'] = df['text'].astype(str)
+
+ st.markdown('''#### Loading Data & Removing Unwanted Data: ''')
+ st.code('''df = pd.read_csv('Xenophobia.csv', encoding='latin1', nrows=5000)
+ cols_to_drop = ['status_id', 'created_at', 'location'] ''')
+
+ multi1 = ''' #### Next Steps:
+ The next step is to run the sentiment analysis on the dataset; however, the analysis takes a long time to run, so I am only going to test 5000 rows out of the millions of rows.
+ 1. The first step is to initialize the model and call on it from HuggingFace'''
+
+ st.markdown(multi1)
+ st.code('''MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL)''')
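Rather than hard-coding which logit index is negative/neutral/positive, the model config can confirm it; `id2label` is the standard field on Hugging Face sequence-classification configs (the printed mapping below is what this model is expected to report, not something verified here):

    print(model.config.id2label)  # e.g. {0: 'negative', 1: 'neutral', 2: 'positive'}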
+
+ st.markdown('''2. Then we have to clean the text (strip non-ASCII characters and excess whitespace)''')
+ st.code('''def clean_text(text):
+     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text''')
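For concreteness, here is what that cleaner does to a sample string (a toy input made up for illustration; runs of non-ASCII characters become single spaces, then whitespace is collapsed):

    sample = "Héllo   wörld!"
    print(clean_text(sample))  # -> "H llo w rld!"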
+ st.markdown('''3. Lastly, we will have to run the model on the cleaned data and write the sentiment scores out to a new csv file to be used''')
+
+ st.code('''def examine_text(example):
+     try: # use try/except to handle errors, such as text that is too long
+         encoded_text = tokenizer( # setting conditions
+             example,
+             return_tensors='pt',
+             truncation=True,
+             max_length=512,
+             padding="max_length"
+         )
+         output = model(**encoded_text)
+         scores = output.logits[0].detach().numpy()
+         scores = softmax(scores) # softmax turns the logits into probabilities: the exponential of each element divided by the sum of the exponentials
+         return {
+             'neg': scores[0],
+             'neu': scores[1],
+             'pos': scores[2]
+         }
+     except Exception as e: # handling errors
+         print(f"Error processing text: {example}\nError: {e}")
+         return None ''')
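As a quick sanity check of that softmax step, a minimal sketch with made-up logits:

    import numpy as np
    from scipy.special import softmax

    logits = np.array([2.0, 0.5, -1.0])  # hypothetical raw model outputs
    probs = softmax(logits)              # exp(x) / sum(exp(x))
    print(probs, probs.sum())            # ~[0.79 0.18 0.04], sums to 1.0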
+
+ st.code('''results = []
+
+ # Process each text
+ for i, row in tqdm(df.iterrows(), total=len(df)):
+     text = clean_text(row['text'])
+     scores = examine_text(text)
+     if scores:
+         # Append scores to results
+         results.append({'index': i, 'neg': scores['neg'], 'neu': scores['neu'], 'pos': scores['pos']})
+     else:
+         print(f"Skipped problematic text: {text}")
+
+ # Convert results to a DataFrame
+ results_df = pd.DataFrame(results)
+
+ # Save to CSV
+ results_df.to_csv('sentiment_scores.csv', index=False)
+ print("Saved sentiment scores to 'sentiment_scores.csv'")
+ # prints out when done - took me 20 minutes for 5000 rows so imagine a million''')
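Row-by-row inference is the bottleneck behind that 20-minute runtime; a batched variant along these lines would likely be much faster. This is only a sketch under assumptions: PyTorch is installed, a batch size of 32 fits in memory, and `texts`/`examine_batch` are hypothetical names, not part of the committed app:

    import torch

    def examine_batch(texts, batch_size=32):
        all_scores = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded = tokenizer(batch, return_tensors='pt', truncation=True,
                                max_length=512, padding=True)
            with torch.no_grad():  # inference only, no gradients needed
                logits = model(**encoded).logits
            all_scores.extend(softmax(logits.numpy(), axis=1))  # each row: [neg, neu, pos]
        return all_scores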
+ st.markdown(''' ### Plotting in Altair
+ We then just load in the data from the csv file with the sentiment scores and create plots with it''')
+ st.code('''# Load sentiment scores
+ sentiment_scores = pd.read_csv('sentiment_scores.csv')
+ df = df.reset_index().merge(sentiment_scores, on='index')''')
+
+ # Load sentiment scores
+ sentiment_scores = pd.read_csv('sentiment_scores.csv')
+ df = df.reset_index().merge(sentiment_scores, on='index')
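The merge works because `reset_index()` materializes the row index as a column literally named 'index', matching the 'index' column saved into sentiment_scores.csv. A tiny illustration with made-up data:

    import pandas as pd

    left = pd.DataFrame({'text': ['a', 'b']})                    # row index 0, 1
    scores = pd.DataFrame({'index': [0, 1], 'pos': [0.9, 0.2]})
    print(left.reset_index().merge(scores, on='index'))          # columns: index, text, pos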
+
+ # Clean text function
+ def clean_text(text):
+     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+ df['cleaned_text'] = df['text'].apply(clean_text)
+
+ # Determine the highest sentiment score for each row
+ df['highest_score'] = df[['neg', 'neu', 'pos']].max(axis=1)
+ df['sentiment_type'] = df[['neg', 'neu', 'pos']].idxmax(axis=1) # neg/neu/pos as categories
+ df['sentiment_type'] = df['sentiment_type'].replace({
+     'neg': 'Negative',
+     'neu': 'Neutral',
+     'pos': 'Positive'
+ })
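To see what `max(axis=1)` and `idxmax(axis=1)` produce here, a toy example with invented scores:

    import pandas as pd

    toy = pd.DataFrame({'neg': [0.7, 0.1], 'neu': [0.2, 0.3], 'pos': [0.1, 0.6]})
    print(toy.max(axis=1))     # 0.7, 0.6  -> highest score per row
    print(toy.idxmax(axis=1))  # 'neg', 'pos' -> which column held it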
+
+ # Sidebar: Filters
+ st.sidebar.header("Filters")
+ sentiment_filter = st.sidebar.multiselect(
+     "Select Sentiment Types to Display:",
+     options=['Negative', 'Neutral', 'Positive'],
+     default=['Negative', 'Neutral', 'Positive']
+ )
+ score_filter = st.sidebar.slider(
+     "Select Minimum Sentiment Score:",
+     min_value=0.0000,
+     max_value=1.0000,
+     value=0.0000,
+     step=0.0001
+ )
+
+ # Filter the DataFrame to only include points that meet the criteria
+ filtered_df = df[
+     (df['sentiment_type'].isin(sentiment_filter)) & # match the selected sentiment types
+     (df['highest_score'] >= score_filter)           # match the slider's minimum score
+ ]
+
+ filtered_counts = filtered_df['sentiment_type'].value_counts()
+
+ # Generate a summary message for the counts
+ filtered_summary = (
+     f"**Filtered DataFrame:**\n"
+     f"- **Negative Sentiments Count:** {filtered_counts.get('Negative', 0)}\n"
+     f"- **Neutral Sentiments Count:** {filtered_counts.get('Neutral', 0)}\n"
+     f"- **Positive Sentiments Count:** {filtered_counts.get('Positive', 0)}"
+ )
+
+ # Build the scatter plot
+ scatter_plot = alt.Chart(filtered_df).mark_circle(size=60).encode(
+     x=alt.X('index:Q', title='Index'),
+     y=alt.Y('highest_score:Q', title='Highest Sentiment Score'),
+     color=alt.Color('sentiment_type:N', title='Sentiment Type', scale=alt.Scale(scheme='tableau20')),
+     tooltip=['index', 'sentiment_type', 'highest_score', 'cleaned_text', 'text']
+ ).properties(
+     width=800,
+     height=400,
+     title="Scatter Plot of Sentiment Scores (Filtered)"
+ ).interactive()
+
+ # Display the scatter plot
+ st.altair_chart(scatter_plot, use_container_width=True)
+
+ # Display the filtered DataFrame and counts
+ st.write(filtered_summary)
+ st.dataframe(
+     filtered_df[['sentiment_type', 'cleaned_text', 'highest_score', 'text']]
+ )
+
+ st.header('''Write Up''')
+ multi2 = '''As mentioned at the beginning, as an Asian American I wanted to highlight the xenophobic tweets during
+ Covid-19, and using a trained sentiment analysis model to analyze and visualize the tweets was an instant idea
+ when I found the dataset. In the first plot, I decided to use a scatter plot to better visualize all the tweets, plotting each tweet's highest sentiment score.
+ To do that, I had to compare the negative, neutral, and positive scores and find the highest one. Then, depending on which filters/interactivity options you select,
+ the dataframe and the scatter plot update accordingly. As an additional layer, I wanted the scatter plot to be efficient for comparing sentiments, so the data related
+ to a point appears when it is hovered over.
+
+ For the second plot, I wanted the expert to easily view the dataframe and use it as a secondary reference to the scatter plot, a better view for making insights, because in a table format
+ you're able to see all the columns better. Additionally, I adjusted the column order to make it as efficient as possible. As an additional layer, there is a count of how many
+ data points are showing, and it is updated according to the filters.
+
+ For the interactivity, I wanted two types of filters: the multiselect first lets the expert easily manage the points, and the slider then filters the points again by sentiment score.
+
+ Overall, I am happy with the plots, but if I had more time, I would definitely load and analyze more data points, which is an easy task: just change the number of rows to parse and
+ let the program run unattended. However, I don't know how long that would take; I already explored options like the tqdm module to add a progress bar, but it doesn't really work in the terminal locally for me.
+ '''
+ st.markdown(multi2)
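On that last point: the file imports `tqdm.notebook`, whose widget-based bar only renders inside Jupyter; the plain import usually does show a progress bar in a terminal. A minimal sketch (the loop body is a made-up stand-in for the scoring call):

    import time
    from tqdm import tqdm  # plain tqdm, not tqdm.notebook

    for _ in tqdm(range(100), desc="scoring"):
        time.sleep(0.01)   # stand-in for examine_text(...)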