# Source header (Hugging Face file-page residue, kept as a comment so the file parses):
# jwu249 — "Upload test.py", commit 37bbc93 (verified)
import streamlit as st
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
import nltk
# Download stopwords
# NLTK corpora ship separately from the library; download() fetches the
# stopword corpus on first run and is a no-op when it is already cached.
nltk.download('stopwords')
# Materialize the English stopwords as a set for O(1) membership tests
# in clean_and_tokenize() below.
stop_words = set(stopwords.words('english'))
# Title
st.title("Covid-19 Xenophobic Twitter Tweets Interactive Word Cloud")
st.write('''### Group: Jason Wu''')
# External references rendered as links further down the page:
# url  = source dataset, url2 = sentiment model, url3 = Vega word-cloud docs,
# url4 = the step-2 (expert) visualization space.
url = "https://www.kaggle.com/datasets/rahulgoel1106/xenophobia-on-twitter-during-covid19"
url2 = "https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest"
url3 = "https://vega.github.io/vega/docs/transforms/wordcloud/"
url4 = "https://huggingface.co/spaces/jwu249/is445_final"
# Intro paragraph shown under the title.
multi = '''This visualization aims to help the public understand the xenophobia on Twitter during Covid-19,
the data used is from the Kaggle dataset linked above. The data in this visualization has been ran through a trained data model set on Twitter for sentiment.
You should take a look at step 2 for the analyzing and expert visual if you're interested.
'''
# Explanation of the interactive controls, rendered after the word cloud.
multi2 = '''#### What does it mean?
You set the type of sentiment you want to see and set the scores, the closer range is to 1. The more likely the sentiment is what you selected according to the model.'''
# Long-form write-up rendered at the bottom of the page.
multi3 = '''In this dataset, we're given tweets which is similar to any social media platform where you can post a message, topic, or statement online for people, friends, or family to see.
The tweets is from a platform called Twitter or X as of now, the tweets was collected during 2020 in the early stages of Covid-19 where xenophobic and Asian hate at an all time high.
And with Twitter/X being one of the largest free speech platforms in the world. I wanted to analyze these Tweets and determine if the feelings were negative, neutral, or postive during this time.
This research is important to know because Covid-19 was one of the largest modern global pandemics and how people felt during these times is important to study and understand to prevent future hate and blame.
The first step was to conduct analysis on all the tweets but in the dataset, there was over 4 million tweets so it would take a very long time
to analyze all that text on a laptop. So for a sample size, I decided to analyze only 5000 rows of text. The downside of this is that it may not be overall representative of majority of peoples'
sentiment but if I was to scale this up, I could easily do that as well with more time. So I first had to clean the text which is just removing any words that don't make any sense like random characters #%#@ or ewfwcfx, etc.
Next I ran the trained model which is just a computer that has judged peoples' feelings so many times that it can confidently judge our texts too. Then the analyzed text scores from the computer is just stored in a text file for us to use.
For the next part and the visual you're seeing now, it is called a word cloud which is created from that text file we analyzed with the computer. The word cloud counts how many of the same words appear in the text and counts them up. Then the larger the amount the word has,
the bigger the word becomes in the visual. To help you understand the word cloud better, I added different colors for each sentiments when you select them. Also I add small texts for the interactions to help you understand what they mean. Additionally, included are different sources for inspiration for this visual. So feel free to
check them out if you want to learn more about word clouds. '''
# Render the intro and the reference links.
st.markdown(multi)
st.write("Dataset Link to Download -> [Kaggle Covid-19 Xenophobic Datatset](%s)" % url)
st.write("Trained Sentiment Analyzer -> [Huggicardiffnlp / twitter-roberta-base-sentiment-latest](%s)" % url2)
st.write("Step 2 Expert Visualization: %s" % url4)
st.header('''Sentiment Word Cloud''')
# Load the precomputed sentiment scores and a 5000-row sample of the raw
# tweets, then join them into a single frame keyed on row position.
scores = pd.read_csv('sentiment_scores.csv')
tweets = pd.read_csv('Xenophobia.csv', encoding='latin1', nrows=5000)
# The scores file carries an explicit 'index' column that mirrors the tweet
# frame's positional index, so the join aligns row i with its score row.
df = tweets.merge(scores, left_index=True, right_on='index')
# Clean and tokenize text
def clean_and_tokenize(text, stop_word_set=None):
    """Strip everything but letters from *text* and return lowercase tokens.

    Parameters
    ----------
    text : any
        Raw tweet text. Coerced with ``str()`` so NaN/None values from the
        CSV are tolerated instead of raising.
    stop_word_set : set[str] | None, optional
        Words to drop after lowercasing. Defaults to the module-level NLTK
        English ``stop_words`` set, preserving the original behavior.

    Returns
    -------
    list[str]
        Lowercased tokens with stopwords removed. Note the regex keeps only
        ASCII letters and whitespace — digits and punctuation are removed,
        not just "non-alphanumeric" characters.
    """
    if stop_word_set is None:
        stop_word_set = stop_words  # NLTK English stopwords built at startup
    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text))
    # Lowercase once for the whole string instead of once per token.
    return [w for w in letters_only.lower().split() if w not in stop_word_set]
# Tokenize every tweet and label each row with its dominant sentiment.
sentiment_cols = ['neg', 'neu', 'pos']
df['cleaned_tokens'] = df['text'].map(clean_and_tokenize)
# idxmax returns the column name ('neg'/'neu'/'pos') holding the highest
# score in each row, which serves as that row's sentiment label.
df['sentiment_type'] = df[sentiment_cols].idxmax(axis=1)
# Sidebar filters: sentiment category plus an inclusive score range.
sentiment = st.selectbox("Select Sentiment (neg = negative | neu = neutral | pos = positive)", ['neg', 'neu', 'pos'])
score_start = st.slider("Min Score | Closer to 0 = Less likely the sentiment", 0.0, 1.0, 0.0, 0.01)
score_end = st.slider("Max Score | Closer to 1 = More likely the sentiment", 0.0, 1.0, 1.0, 0.01)
# Keep only rows whose dominant sentiment matches the selection and whose
# score for that sentiment lies in [score_start, score_end] (between() is
# inclusive on both ends by default, matching >= and <=).
row_mask = (df['sentiment_type'] == sentiment) & df[sentiment].between(score_start, score_end)
filtered_df = df[row_mask]
# Generate and display the word cloud for the filtered rows; warn when the
# current filters match nothing.
if filtered_df.empty:
    st.warning("No data matches the selected filters.")
else:
    # Flatten the per-row token lists into one flat list of words.
    flat_words = [tok for row_tokens in filtered_df['cleaned_tokens'] for tok in row_tokens]
    # One matplotlib colormap per sentiment so the cloud's hue signals the
    # current selection (red = negative, green = neutral, blue = positive).
    sentiment_cmaps = {'neg': 'Reds', 'neu': 'Greens', 'pos': 'Blues'}
    cloud = WordCloud(width=800, height=400, background_color='white', colormap=sentiment_cmaps[sentiment])
    cloud.generate(' '.join(flat_words))
    st.subheader(f"{sentiment.capitalize()} Words Cloud (Score Range: {score_start} - {score_end})")
    # Render through matplotlib so Streamlit can display the bitmap.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
# Explain the controls now that the user has seen the word cloud.
st.markdown(multi2)
st.header('''Link to Python Notebook''')
st.link_button("Notebook", "https://huggingface.co/spaces/jwu249/is445_final_part3/blob/main/test.py")
# Inspiration/context section: screenshots with long caption text.
st.header('''Inspirations/Context:''')
st.write('''#### Vega Interactive Word Cloud Example''')
st.image("wordcloud.png", caption="I originally tried doing an Jekyll wordcloud but I realized that altair and vega-lite don't support wordcloud plots so I had to look at Vega documentation to figure out how to implement the visual. However, I then switched to Streamlit because of it's ease of implementation. Source: %s" % url3)
st.write('''#### IS445 Data Visualization - Word Clouds ''')
st.image("class_viz.png", caption="I was originally inspired by the word clouds prep notebook to visualize my public visual as a word cloud because it was a nobrainer as my dataset contained text and the words pop in your eyes allowing the viewer to easily visualize the overall message of the data. Source: https://uiuc-ischool-dataviz.github.io/is445_bcubcg_fall2024/nbv.html?notebook_name=%2Fis445_bcubcg_fall2024%2Fweek16%2FinClass_week16.ipynb")
# Closing long-form write-up defined near the top of the file.
st.header('''Write Up:''')
st.markdown(multi3)