Spaces:

nlpblogs
/

youtube-sentiment-analysis-app

Running

App Files Files Community

youtube-sentiment-analysis-app / app.py

nlpblogs

Update app.py

a77c47d verified about 2 months ago

raw

history blame

6.74 kB

	import streamlit as st
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.chrome.options import Options
	from selenium.webdriver.chrome.service import Service
	import pandas as pd
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	import time
	from webdriver_manager.chrome import ChromeDriverManager
	from webdriver_manager.chrome import ChromeType
	import transformers
	import torch
	import plotly.express as px

	st.subheader("YouTube Comments Sentiment Analysis", divider="red")


	@st.cache_resource
	def _load_sentiment_model(model_name="tabularisai/robust-sentiment-analysis"):
	    """Load the sentiment tokenizer and classifier once per server process.

	    Streamlit re-executes this script from top to bottom on every widget
	    interaction. Without caching, both objects would be re-instantiated on
	    each rerun; ``st.cache_resource`` keeps a single shared copy instead.

	    Args:
	        model_name: Hugging Face Hub identifier of the checkpoint to load.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    loaded_tokenizer = transformers.DistilBertTokenizer.from_pretrained(model_name)
	    loaded_model = transformers.DistilBertForSequenceClassification.from_pretrained(model_name)
	    return loaded_tokenizer, loaded_model


	# Module-level names are kept so the rest of the script can keep using them.
	tokenizer, model = _load_sentiment_model()

	# Upper bound that the analysis button compares the session counter against.
	max_attempts = 5

	# Seed the per-session counter the first time the script runs for a session;
	# later reruns leave the stored value untouched.
	if 'url_count' not in st.session_state:
	    st.session_state['url_count'] = 0

	def update_url_count():
	    """Add one to the ``'url_count'`` value stored in the Streamlit session state."""
	    session = st.session_state
	    session['url_count'] = session['url_count'] + 1

	def clear_question():
	    """Blank the ``"url"`` session-state entry (the key used by the URL text input).

	    Registered as the ``on_click`` callback of the "Clear question" button.
	    """
	    session = st.session_state
	    session["url"] = ""

	# The text input is bound to session-state key "url", which is the entry
	# that clear_question() resets when the button below is clicked.
	url = st.text_input(label="Enter YouTube URL:", key="url")
	st.button(label="Clear question", on_click=clear_question)

	# XPath candidates for a comment's timestamp, tried in order from the most
	# specific to the most general. Each is evaluated relative to the
	# comment-text element.
	_TIMESTAMP_XPATHS = (
	    './/yt-formatted-string[@class="published-time-text style-scope ytd-comment-renderer"]',
	    './ancestor::ytd-comment-renderer//yt-formatted-string[contains(@class, "time-text")]',
	    './ancestor::ytd-comment-renderer//a[@id="time"]',
	)


	def _is_youtube_url(candidate):
	    """Return True when *candidate* is an http(s) URL on a YouTube host.

	    The URL is handed to a server-side headless browser, so anything that is
	    not a YouTube address is rejected up front instead of being loaded.
	    """
	    from urllib.parse import urlparse  # local import: only this helper needs it

	    try:
	        parsed = urlparse(candidate)
	        host = (parsed.hostname or "").lower()
	    except ValueError:
	        # urlparse raises on malformed netlocs such as an unclosed "[".
	        return False
	    on_youtube = host in ("youtu.be", "youtube.com") or host.endswith(".youtube.com")
	    return parsed.scheme in ("http", "https") and on_youtube


	def _extract_timestamp(comment):
	    """Return the timestamp text for *comment*, or ``"Timestamp not found"``.

	    Lookup is best effort: a failing XPath simply moves on to the next
	    candidate, and only the last failure is printed for debugging.
	    """
	    last_error = None
	    for xpath in _TIMESTAMP_XPATHS:
	        try:
	            return comment.find_element(By.XPATH, xpath).text
	        except Exception as err:
	            last_error = err
	    print(f"Timestamp not found for comment: {comment.text}. Error: {last_error}")  # debug
	    return "Timestamp not found"


	def _scrape_comments(page_url, scroll_count=30, timeout=30):
	    """Scroll a page in headless Chromium and collect its comments.

	    Args:
	        page_url: Address to open in the browser.
	        scroll_count: Number of 500px scroll steps to perform.
	        timeout: Seconds each explicit wait may block.

	    Returns:
	        A DataFrame with columns ``"Comment"`` and ``"comment_date"``. It is
	        empty when the page could not be opened or no comment was found.
	    """
	    options = Options()
	    for flag in (
	        "--headless",
	        "--disable-gpu",
	        "--no-sandbox",
	        "--disable-dev-shm-usage",
	        "--start-maximized",
	    ):
	        options.add_argument(flag)
	    service = Service(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install())
	    driver = webdriver.Chrome(service=service, options=options)
	    data = []
	    try:
	        wait = WebDriverWait(driver, timeout)
	        try:
	            driver.get(page_url)
	        except Exception as e:
	            st.error(f"Could not open the URL: {e}")
	            return pd.DataFrame(data, columns=["Comment", "comment_date"])

	        placeholder = st.empty()
	        progress_bar = st.progress(0)

	        for item in range(scroll_count):
	            try:
	                driver.execute_script("window.scrollBy(0, 500);")
	                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#content #content-text")))
	                placeholder.text(f"Scrolled {item + 1} times")
	                progress_bar.progress((item + 1) / scroll_count)
	                time.sleep(1)  # give dynamically loaded comments time to appear
	            except Exception as e:
	                st.error(f"Exception during scrolling: {e}")
	                break

	        placeholder.text("Scrolling complete.")
	        progress_bar.empty()

	        try:
	            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#contents #contents")))
	            for comment in driver.find_elements(By.CSS_SELECTOR, "#content #content-text"):
	                data.append({"Comment": comment.text, "comment_date": _extract_timestamp(comment)})
	        except Exception as e:
	            st.error(f"Exception during comment extraction: {e}")
	    finally:
	        # Always release the browser process, even when a step above raised.
	        driver.quit()
	    return pd.DataFrame(data, columns=["Comment", "comment_date"])


	def _predict_sentiments(comments, batch_size=32):
	    """Return one sentiment label per entry of *comments*.

	    Texts are classified in batches of *batch_size* so memory use stays
	    bounded no matter how many comments were scraped. The argmax is taken on
	    the raw logits; softmax is monotonic, so it would pick the same class.
	    """
	    labels = []
	    with torch.no_grad():
	        for start in range(0, len(comments), batch_size):
	            batch = comments[start:start + batch_size]
	            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
	            logits = model(**inputs).logits
	            labels.extend(model.config.id2label[idx] for idx in logits.argmax(dim=-1).tolist())
	    return labels


	def _render_results(df, sentiment_df):
	    """Draw the pie/bar tabs, the combined table and the CSV download button."""
	    final_df = sentiment_df['Sentiment'].value_counts().rename_axis('Sentiment').reset_index(name='count')
	    result = pd.concat([df, sentiment_df], axis=1)

	    tab1, tab2 = st.tabs(["Pie Chart", "Bar Chart"])
	    with tab1:
	        fig1 = px.pie(final_df, values='count', names='Sentiment', hover_data=['count'], labels={'count': 'count'})
	        fig1.update_traces(textposition='inside', textinfo='percent+label')
	        st.plotly_chart(fig1)

	    st.dataframe(result)

	    with tab2:
	        fig2 = px.bar(result, x="Sentiment", y="comment_date", color="Sentiment")
	        st.plotly_chart(fig2)

	    csv = result.to_csv(index=False)
	    st.download_button(label="Download data as CSV", data=csv, file_name='Summary of the results.csv', mime='text/csv')


	if st.button("Sentiment Analysis", type="secondary"):
	    target_url = url.strip()
	    if st.session_state['url_count'] >= max_attempts:
	        st.warning(f"You have reached the maximum URL attempts ({max_attempts}).")
	    elif not target_url:
	        st.warning("Please enter a URL.")
	    elif not _is_youtube_url(target_url):
	        st.warning("Please enter a valid YouTube URL.")
	    else:
	        # Count the attempt; without this call the counter never moves and
	        # the max_attempts limit above can never be reached.
	        update_url_count()
	        with st.spinner("Wait for it...", show_time=True):
	            df = _scrape_comments(target_url)
	            if df.empty:
	                st.warning("No comments were scraped. Sentiment analysis could not be performed.")
	            else:
	                st.dataframe(df)
	                sentiments = _predict_sentiments(df['Comment'].tolist())
	                sentiment_df = pd.DataFrame({
	                    'Review Number': range(1, len(sentiments) + 1),
	                    'Sentiment': sentiments,
	                })
	                _render_results(df, sentiment_df)

	# Show the value of the session's 'url_count' entry whenever it exists.
	if 'url_count' in st.session_state:
	    times_pasted = st.session_state['url_count']
	    st.write(f"URL pasted {times_pasted} times.")