Jason Wu committed on
Commit faa1d38 · 1 Parent(s): 636549a
Files changed (1)
  1. app.py +238 -61
app.py CHANGED
@@ -1,73 +1,250 @@
- # INSTRUCTIONS:
- # 1. Open a "Terminal" by: View --> Terminal OR just the "Terminal" through the hamburger menu
- # 2. run in terminal with: streamlit run app.py
- # 3. click the "Open in Browser" link that pops up OR click on "Ports" and copy the URL
- # 4. Open a Simple Browser with View --> Command Palette --> Simple Browser: Show
- # 5. use the URL from prior steps as input into this simple browser
-
-
  import streamlit as st
  import altair as alt
- from vega_datasets import data
-
- st.title('Streamlit App for IS445: ID47122')
-
- st.text("The URL for this app is: https://huggingface.co/spaces/jwu249/is445_demo")
-
- source = data.seattle_weather()
-
- scale = alt.Scale(
-     domain=["sun", "fog", "drizzle", "rain", "snow"],
-     range=["#e7ba52", "#a7a7a7", "#aec7e8", "#1f77b4", "#9467bd"],
  )
- color = alt.Color("weather:N", scale=scale)
-
- # We create two selections:
- # - a brush that is active on the top panel
- # - a multi-click that is active on the bottom panel
- brush = alt.selection_interval(encodings=["x"])
- click = alt.selection_point(encodings=["color"])
-
- # Top panel is scatter plot of temperature vs time
- points = (
-     alt.Chart()
-     .mark_point()
-     .encode(
-         alt.X("monthdate(date):T", title="Date (Month Year)"),
-         alt.Y(
-             "temp_max:Q",
-             title="Maximum Daily Temperature (C)",
-             scale=alt.Scale(domain=[-5, 40]),
-         ),
-         color=alt.condition(brush, color, alt.value("lightgray")),
-         size=alt.Size("precipitation:Q", scale=alt.Scale(range=[5, 200])),
-     )
-     .properties(width=550, height=300)
-     .add_params(brush)
-     .transform_filter(click)
  )

- # Bottom panel is a bar chart of weather type
- bars = (
-     alt.Chart()
-     .mark_bar()
-     .encode(
-         x="count()",
-         y="weather:N",
-         color=alt.condition(click, color, alt.value("lightgray")),
-     )
-     .transform_filter(brush)
-     .properties(
-         width=550,
-     )
-     .add_params(click)
  )

- chart = alt.vconcat(points, bars, data=source, title="Seattle Weather - 2002 to 2012")

- tab1, tab2 = st.tabs(["Streamlit theme (default)", "Altair native theme"])
-
- with tab1:
-     st.altair_chart(chart, theme="streamlit", use_container_width=True)
- with tab2:
-     st.altair_chart(chart, theme=None, use_container_width=True)
  import streamlit as st
  import altair as alt
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import re
+ from tqdm import tqdm  # plain tqdm renders in a terminal; the notebook variant needs Jupyter widgets
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from scipy.special import softmax
+
+ # pip install torch torchvision torchaudio
+ # pip install transformers
+
+ st.title("Final Project Part 2 - jwu249 | Expert Visualizations")
+
+ url = "https://www.kaggle.com/datasets/rahulgoel1106/xenophobia-on-twitter-during-covid19"
+ st.write("Dataset Link to Download -> [Kaggle Covid-19 Xenophobia Dataset](%s)" % url)
+
+ plt.style.use('ggplot')
+
+ multi = '''The chosen dataset is called Xenophobia; as the name suggests, it captures xenophobic posts on Twitter
+ during the early stages of Covid-19. Here we conduct sentiment analysis on it using a pretrained Twitter sentiment model.
+ #### To follow: '''
+ st.markdown('''### About:''')
+ st.markdown(multi)
+ st.code('''# pip install these packages in your terminal or workspace
+ pip install torch torchvision torchaudio  # to work with the trained model
+ pip install transformers                  # to work with the trained model''')
+
+ df = pd.read_csv('Xenophobia.csv', encoding='latin1', nrows=5000)
+ cols_to_drop = ['status_id', 'created_at', 'location']
+ df.drop(cols_to_drop, axis=1, inplace=True)
+
+ # Convert text to string type
+ df['text'] = df['text'].astype(str)
+
+ st.markdown('''#### Loading Data & Removing Unwanted Data: ''')
+ st.code('''df = pd.read_csv('Xenophobia.csv', encoding='latin1', nrows=5000)
+ cols_to_drop = ['status_id', 'created_at', 'location']
+ df.drop(cols_to_drop, axis=1, inplace=True)''')
+
+ multi1 = ''' #### Next Steps:
+ The next step is to run the sentiment analysis on the dataset; however, the analysis takes a long time to run, so I am only testing 5,000 of the millions of rows.
+ 1. The first step is to initialize the model, loading it from HuggingFace'''
+
+ st.markdown(multi1)
+ st.code('''MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL)''')
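+ # Note: this checkpoint outputs three logits; index 0 = negative, 1 = neutral,
+ # 2 = positive, which is what the scores[0]/scores[1]/scores[2] lookups below
+ # assume - check the model card's label mapping if you swap in another model.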
+
+ st.markdown('''2. Then we clean the text, stripping non-ASCII characters and excess whitespace''')
+ st.code('''def clean_text(text):
+     text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # drop non-ASCII characters
+     text = re.sub(r'\s+', ' ', text).strip()    # collapse whitespace
+     return text''')
+ st.markdown('''3. Lastly, we run the model on the cleaned text and save the sentiment scores to a new CSV file for later use''')
+
+ st.code('''def examine_text(example):
+     try:  # the try statement handles errors such as text that is too long
+         encoded_text = tokenizer(
+             example,
+             return_tensors='pt',
+             truncation=True,
+             max_length=512,
+             padding="max_length"
+         )
+         output = model(**encoded_text)
+         scores = output.logits[0].detach().numpy()
+         scores = softmax(scores)  # softmax maps logits to probabilities: each exponential divided by the sum of exponentials
+         return {
+             'neg': scores[0],
+             'neu': scores[1],
+             'pos': scores[2]
+         }
+     except Exception as e:  # handling errors
+         print(f"Error processing text: {example}\nError: {e}")
+         return None''')
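+
+ # A quick illustration of what softmax does (hypothetical logits, not real model
+ # output): softmax([-1.0, 0.5, 2.0]) is roughly [0.04, 0.18, 0.79] - the values
+ # are positive and sum to 1, so they can be read as probabilities.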
+
+ st.code('''results = []
+
+ # Process each text
+ for i, row in tqdm(df.iterrows(), total=len(df)):
+     text = clean_text(row['text'])
+     scores = examine_text(text)
+     if scores:
+         # Append scores to results
+         results.append({'index': i, 'neg': scores['neg'], 'neu': scores['neu'], 'pos': scores['pos']})
+     else:
+         print(f"Skipped problematic text: {text}")
+
+ # Convert results to a DataFrame
+ results_df = pd.DataFrame(results)
+
+ # Save to CSV
+ results_df.to_csv('sentiment_scores.csv', index=False)
+ print("Saved sentiment scores to 'sentiment_scores.csv'")
+ # prints out when done - took me 20 minutes for 5000 rows so imagine a million''')
+
+ # NOTE: the full pipeline shown above (model init, clean_text, examine_text, and
+ # the scoring loop) was run once offline to produce sentiment_scores.csv; it is
+ # left out of the live app so each page load does not redo ~20 minutes of scoring.
+
+ st.markdown('''### Plotting in Altair
+ We then load the sentiment scores back from the CSV file and build plots from them''')
+ st.code('''# Load sentiment scores
+ sentiment_scores = pd.read_csv('sentiment_scores.csv')
+ df = df.reset_index().merge(sentiment_scores, on='index')''')
+
+ # Load sentiment scores
+ sentiment_scores = pd.read_csv('sentiment_scores.csv')
+ df = df.reset_index().merge(sentiment_scores, on='index')
+
+ # Clean text function
+ def clean_text(text):
+     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+     text = re.sub(r'\s+', ' ', text).strip()
+     return text
+
+ df['cleaned_text'] = df['text'].apply(clean_text)
+
+ # Determine the highest sentiment score for each row
+ df['highest_score'] = df[['neg', 'neu', 'pos']].max(axis=1)
+ df['sentiment_type'] = df[['neg', 'neu', 'pos']].idxmax(axis=1)  # column label of the max: neg/neu/pos
+ df['sentiment_type'] = df['sentiment_type'].replace({
+     'neg': 'Negative',
+     'neu': 'Neutral',
+     'pos': 'Positive'
+ })
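+ # For example (hypothetical row): neg=0.10, neu=0.25, pos=0.65 gives
+ # highest_score = 0.65, and idxmax returns the column label 'pos',
+ # which the replace() above relabels as 'Positive'.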
+
+ # Sidebar: Filters
+ st.sidebar.header("Filters")
+ sentiment_filter = st.sidebar.multiselect(
+     "Select Sentiment Types to Display:",
+     options=['Negative', 'Neutral', 'Positive'],
+     default=['Negative', 'Neutral', 'Positive']
  )
+ score_filter = st.sidebar.slider(
+     "Select Minimum Sentiment Score:",
+     min_value=0.0000,
+     max_value=1.0000,
+     value=0.0000,
+     step=0.0001
  )

+ # Filter the DataFrame to only include points that meet criteria
+ filtered_df = df[
+     (df['sentiment_type'].isin(sentiment_filter)) &  # Match selected sentiment types
+     (df['highest_score'] >= score_filter)            # Match slider score threshold
+ ]
+
+ filtered_counts = filtered_df['sentiment_type'].value_counts()
+
+ # Generate a summary message for the counts
+ filtered_summary = (
+     f"**Filtered DataFrame:**\n"
+     f"- **Negative Sentiments Count:** {filtered_counts.get('Negative', 0)}\n"
+     f"- **Neutral Sentiments Count:** {filtered_counts.get('Neutral', 0)}\n"
+     f"- **Positive Sentiments Count:** {filtered_counts.get('Positive', 0)}"
  )

+ # Build the scatter plot
+ scatter_plot = alt.Chart(filtered_df).mark_circle(size=60).encode(
+     x=alt.X('index:Q', title='Index'),
+     y=alt.Y('highest_score:Q', title='Highest Sentiment Score'),
+     color=alt.Color('sentiment_type:N', title='Sentiment Type', scale=alt.Scale(scheme='tableau20')),
+     tooltip=['index', 'sentiment_type', 'highest_score', 'cleaned_text', 'text']
+ ).properties(
+     width=800,
+     height=400,
+     title="Scatter Plot of Sentiment Scores (Filtered)"
+ ).interactive()
+
+ # Display the scatter plot
+ st.altair_chart(scatter_plot, use_container_width=True)
+
+ # Display the filtered DataFrame and counts
+ st.write(filtered_summary)
+ st.dataframe(
+     filtered_df[['sentiment_type', 'cleaned_text', 'highest_score', 'text']]
+ )
+
+ st.header('''Write Up''')
+ multi2 = '''As mentioned in the beginning, as an Asian American I wanted to highlight the xenophobic tweets during
+ Covid-19, and using a trained sentiment analysis model to analyze and visualize the tweets was an instant idea
+ when I found the dataset. For the first plot, I decided to use a scatter plot to better visualize all the tweets, plotting each one's highest sentiment score.
+ To do that, I had to compare the negative, neutral, and positive scores and find the highest one. Then, depending on which filters/interactivity you select,
+ the dataframe and the scatter plot update accordingly. As an additional layer, I wanted the scatter plot to be efficient for comparing sentiments, so the data related
+ to a point appears when it is hovered over.
+
+ For the second plot, I wanted the expert to easily view the dataframe and use it as a secondary reference to the scatter plot for making insights, because in a table format
+ you're able to see all the columns better. Additionally, I reordered the columns to make it as efficient as possible. As an additional layer, there is a count of how many
+ data points are showing, and it updates according to the filters.
+
+ For the interactivity, I wanted two types of filters: the multiselect first lets the expert easily manage the points, and the score slider then filters the points again by sentiment score.
+
+ Overall, I am happy with the plots, but if I had more time I would definitely load and analyze more data points, which is easy to do by just increasing the number of rows to parse and
+ letting the program run unattended. However, I just don't know how long that would take, as I already explored options like the tqdm module to add a progress bar, but it doesn't really work in the terminal locally for me.
+ '''
+ st.markdown(multi2)
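+
+ # A minimal sketch (commented out, with a hypothetical chunk size) of how the
+ # offline scoring could cover more rows with a terminal-friendly progress bar:
+ # plain tqdm writes to stderr, so it shows up when running the script from a
+ # terminal, unlike tqdm.notebook, which needs Jupyter widgets.
+ # from tqdm import tqdm
+ # results = []
+ # for chunk in pd.read_csv('Xenophobia.csv', encoding='latin1', chunksize=5000):
+ #     chunk['text'] = chunk['text'].astype(str)
+ #     for i, row in tqdm(chunk.iterrows(), total=len(chunk)):
+ #         scores = examine_text(clean_text(row['text']))
+ #         if scores:
+ #             results.append({'index': i, **scores})
+ # pd.DataFrame(results).to_csv('sentiment_scores.csv', index=False)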