jwu249 committed
Commit 3404242 · verified · 1 Parent(s): 2541b5c

Upload 6 files

Files changed (5)
  1. README.md +2 -2
  2. class_viz.png +0 -0
  3. requirements.txt +3 -1
  4. test.py +94 -0
  5. wordcloud.png +0 -0
README.md CHANGED
@@ -1,11 +1,11 @@
 ---
-title: IS445_HW5
+title: IS445_final_part3
 emoji: 🏢
 colorFrom: blue
 colorTo: gray
 sdk: streamlit
 sdk_version: 1.39.0
-app_file: app.py
+app_file: test.py
 pinned: false
 license: mit
 ---
class_viz.png ADDED
requirements.txt CHANGED
@@ -10,4 +10,6 @@ transformers
 torch
 torchvision
 torchaudio
-scipy
+scipy
+wordcloud
+ipywidgets
test.py ADDED
@@ -0,0 +1,95 @@
+import streamlit as st
+import pandas as pd
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+import re
+import nltk
+from nltk.corpus import stopwords
+
+# Download stopwords (no-op if they are already present)
+nltk.download('stopwords', quiet=True)
+stop_words = set(stopwords.words('english'))
+
+# Title
+st.title("Covid-19 Xenophobic Twitter Tweets Interactive Word Cloud")
+st.write('''### Group: Jason Wu''')
+
+url = "https://www.kaggle.com/datasets/rahulgoel1106/xenophobia-on-twitter-during-covid19"
+url2 = "https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest"
+url3 = "https://vega.github.io/vega/docs/transforms/wordcloud/"
+st.write("Dataset Link to Download -> [Kaggle Covid-19 Xenophobic Dataset](%s)" % url)
+st.write("Trained Sentiment Analyzer -> [cardiffnlp/twitter-roberta-base-sentiment-latest](%s)" % url2)
+
+multi = '''This visualization aims to help the public understand xenophobia on Twitter during Covid-19.
+The data comes from the Kaggle dataset linked above and has been run through a sentiment model trained on Twitter data.
+Take a look at step 2 for the analysis and expert visualization if you're interested.
+'''
+multi2 = '''#### What does it mean?
+You select the sentiment you want to see and set the score range; the closer a score is to 1, the more likely the tweet expresses the sentiment you selected, according to the model.'''
+
+multi3 = '''In this dataset, we're given tweets, which are like posts on any social media platform: a message, topic, or statement shared online for people, friends, or family to see.
+The tweets come from a platform called Twitter (now X) and were collected during 2020, in the early stages of Covid-19, when xenophobia and anti-Asian hate were at an all-time high.
+With Twitter/X being one of the largest free-speech platforms in the world, I wanted to analyze these tweets and determine whether the feelings expressed during this time were negative, neutral, or positive.
+This research matters because Covid-19 was one of the largest modern global pandemics, and studying how people felt during it can help prevent future hate and blame.
+
+The first step was to analyze the tweets, but the dataset contains over 4 million of them, so analyzing all of that text on a laptop would take a very long time.
+As a sample, I analyzed only 5000 rows of text. The downside is that the sample may not be representative of most people's
+sentiment, but with more time this could easily be scaled up. I first had to clean the text, which means removing anything that doesn't make sense as a word, like random characters (#%#@, ewfwcfx, etc.).
+Next I ran the trained model, which is essentially a program that has judged people's feelings so many times that it can confidently judge our texts too. The resulting sentiment scores are stored in a file for the app to use.
+
+The visual you're seeing now is called a word cloud, built from that analyzed file. A word cloud counts how often each word appears in the text:
+the higher a word's count, the bigger the word appears in the visual. To help you read the word cloud, each sentiment gets its own color when you select it, and short captions on the controls explain what they do. I've also included the sources that inspired this visual, so feel free to
+check them out if you want to learn more about word clouds. '''
+st.markdown(multi)
+# Load sentiment scores and cleaned text data
+data = pd.read_csv('sentiment_scores.csv')
+df = pd.read_csv('Xenophobia.csv', encoding='latin1', nrows=5000)
+
+df = df.merge(data, left_index=True, right_on='index')
+
+
+# Clean and tokenize text
+def clean_and_tokenize(text):
+    text = re.sub(r'[^a-zA-Z\s]', '', str(text))  # Keep only letters and whitespace
+    tokens = [word.lower() for word in text.split() if word.lower() not in stop_words]
+    return tokens
+
+df['cleaned_tokens'] = df['text'].apply(clean_and_tokenize)
+
+# Determine the sentiment type based on the highest score
+df['sentiment_type'] = df[['neg', 'neu', 'pos']].idxmax(axis=1)
+
+# Sidebar filters
+sentiment = st.selectbox("Select Sentiment (neg = negative | neu = neutral | pos = positive)", ['neg', 'neu', 'pos'])
+score_start = st.slider("Min Score | Closer to 0 = Less likely the sentiment", 0.0, 1.0, 0.0, 0.01)
+score_end = st.slider("Max Score | Closer to 1 = More likely the sentiment", 0.0, 1.0, 1.0, 0.01)
+
+# Filter data based on user input
+filtered_df = df[(df['sentiment_type'] == sentiment) & (df[sentiment] >= score_start) & (df[sentiment] <= score_end)]
+
+# Generate and display word cloud
+if not filtered_df.empty:
+    all_words = [word for tokens in filtered_df['cleaned_tokens'] for word in tokens]
+    wordcloud = WordCloud(width=800, height=400, background_color='white', colormap={'neg': 'Reds', 'neu': 'Greens', 'pos': 'Blues'}[sentiment])
+    wordcloud.generate(' '.join(all_words))
+
+    st.subheader(f"{sentiment.capitalize()} Word Cloud (Score Range: {score_start} - {score_end})")
+
+    # Display the word cloud
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ax.imshow(wordcloud, interpolation='bilinear')
+    ax.axis('off')
+    st.pyplot(fig)
+else:
+    st.warning("No data matches the selected filters.")
+
+st.markdown(multi2)
+
+st.header('''Inspirations/Context:''')
+st.write('''#### Vega Interactive Word Cloud Example''')
+st.image("wordcloud.png", caption="I originally tried building the word cloud in Jekyll, but Altair and Vega-Lite don't support word cloud plots, so I turned to the Vega documentation to figure out how to implement the visual. I then switched to Streamlit for its ease of implementation. Source: %s" % url3)
+st.write('''#### IS445 Data Visualization - Word Clouds ''')
+st.image("class_viz.png", caption="I was inspired by the word clouds prep notebook to present my public visual as a word cloud; it was a no-brainer since my dataset contains text, and the words pop out at the viewer, making the overall message of the data easy to grasp. Source: https://uiuc-ischool-dataviz.github.io/is445_bcubcg_fall2024/nbv.html?notebook_name=%2Fis445_bcubcg_fall2024%2Fweek16%2FinClass_week16.ipynb")
+
+st.header('''Write Up:''')
+st.markdown(multi3)
wordcloud.png ADDED
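
Note: the offline scoring step described in the write-up (cleaning the 5000-row sample and running it through the cardiffnlp model to produce sentiment_scores.csv) is not part of this commit. Below is a minimal sketch of how that step might look, assuming the cardiffnlp/twitter-roberta-base-sentiment-latest model linked in the app and the same `text` column that test.py reads; the output columns (`index`, `neg`, `neu`, `pos`) are chosen to match what the app's merge and idxmax expect, and may differ from the author's actual script.

```python
# Hypothetical sketch of the offline scoring step; not part of this commit.
# Output layout is an assumption matched to what test.py reads.
import pandas as pd
from transformers import pipeline

# Same 5000-row sample that test.py loads
df = pd.read_csv('Xenophobia.csv', encoding='latin1', nrows=5000)

# The model linked in the app; it emits 'negative'/'neutral'/'positive'
# labels with confidence scores.
classifier = pipeline(
    'sentiment-analysis',
    model='cardiffnlp/twitter-roberta-base-sentiment-latest',
)

# The write-up cleans the text before scoring; that step is omitted here
# for brevity. truncation=True cuts tweets to the model's max length, and
# top_k=None returns scores for all three labels, not just the best one.
results = classifier(
    df['text'].astype(str).tolist(),
    batch_size=32,
    truncation=True,
    top_k=None,
)

label_map = {'negative': 'neg', 'neutral': 'neu', 'positive': 'pos'}
rows = []
for i, scores in enumerate(results):
    row = {'index': i}  # row index, used by the app's merge
    for s in scores:    # one dict per label: {'label': ..., 'score': ...}
        row[label_map[s['label']]] = s['score']
    rows.append(row)

pd.DataFrame(rows).to_csv('sentiment_scores.csv', index=False)
```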