File size: 5,298 Bytes
ca12572 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import streamlit as st
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import pandas as pd
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import sys
from datetime import datetime
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.core.os_manager import ChromeType
import re
import transformers
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import io
import plotly.express as px
import zipfile
import torch
from streamlit_extras.stylable_container import stylable_container
@st.cache_resource
def _load_sentiment_model():
    """Load the sentiment tokenizer and classifier once and reuse them.

    Streamlit re-executes this script from the top on every widget
    interaction. Without caching, both ``from_pretrained`` calls would run
    again on each click; ``st.cache_resource`` keeps the returned objects
    alive between reruns instead.

    Returns:
        A ``(tokenizer, model)`` tuple loaded from the
        "tabularisai/robust-sentiment-analysis" checkpoint.
    """
    checkpoint = "tabularisai/robust-sentiment-analysis"
    return (
        transformers.DistilBertTokenizer.from_pretrained(checkpoint),
        transformers.DistilBertForSequenceClassification.from_pretrained(checkpoint),
    )


# NOTE: cached resources are shared between reruns, so these two objects must
# be treated as read-only by the rest of the script.
tokenizer, model = _load_sentiment_model()
def clear_question():
    """Blank the URL text box.

    Used as the ``on_click`` callback of the "Clear question" button: it
    overwrites the session-state entry that backs the text input whose widget
    key is "youtube_video_url".
    """
    session = st.session_state
    session["youtube_video_url"] = ""
# ---------------------------------------------------------------------------
# Page flow: read a video URL, scrape its comments with a headless Chromium
# browser, classify every comment's sentiment, then chart and export the
# results.
# ---------------------------------------------------------------------------

SCROLL_COUNT = 150          # how many times END is sent to load more comments
SCROLL_PAUSE_SECONDS = 3    # pause after each scroll so new comments can load
PAGE_WAIT_SECONDS = 30      # timeout for every explicit Selenium wait
SENTIMENT_BATCH_SIZE = 32   # comments per forward pass, bounds peak memory


def _scrape_comments(url):
    """Open *url* in headless Chromium and collect the visible comment texts.

    The page is scrolled ``SCROLL_COUNT`` times while a placeholder text and a
    progress bar report progress. Scrolling and extraction errors are shown
    with ``st.error`` and whatever was collected so far is returned.

    Args:
        url: Address of the page to load.

    Returns:
        A list of dicts with the keys "User ID" (1-based position of the
        comment on the page), "Comment" (its text) and "comment_date" (the
        date the scrape ran, formatted ``YYYY-MM-DD``), in page order.

    Raises:
        Exception: anything raised by ``driver.get`` propagates to the caller;
            the browser is still shut down first.
    """
    options = Options()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--start-maximized",
    ):
        options.add_argument(flag)
    service = Service(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install())
    driver = webdriver.Chrome(service=service, options=options)

    rows = []
    # try/finally guarantees the browser process is released even when
    # loading the page raises (e.g. a malformed URL).
    try:
        wait = WebDriverWait(driver, PAGE_WAIT_SECONDS)
        driver.get(url)

        placeholder = st.empty()        # dynamic "Scrolled N times" text
        progress_bar = st.progress(0)
        for step in range(1, SCROLL_COUNT + 1):
            try:
                body = wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body")))
                body.send_keys(Keys.END)
                placeholder.text(f"Scrolled {step} times")
                progress_bar.progress(step / SCROLL_COUNT)
                time.sleep(SCROLL_PAUSE_SECONDS)
            except Exception as e:
                st.error(f"Exception during scrolling: {e}")
                break
        placeholder.text("Scrolling complete.")
        progress_bar.empty()

        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#contents #contents")))
            elements = driver.find_elements(By.CSS_SELECTOR, "#content #content-text")
            scraped_on = datetime.now().strftime("%Y-%m-%d")
            # Rows are appended one at a time so that a failure part-way
            # through still keeps the comments read so far. No set-based
            # de-duplication is applied: every row has a unique "User ID", so
            # it could never drop a row and would only scramble page order.
            for user_id, element in enumerate(elements, start=1):
                rows.append(
                    {"User ID": user_id, "Comment": element.text, "comment_date": scraped_on}
                )
        except Exception as e:
            st.error(f"Exception during comment extraction: {e}")
    finally:
        driver.quit()
    return rows


def _predict_sentiments(texts, batch_size=SENTIMENT_BATCH_SIZE):
    """Return the predicted sentiment label for each text, in input order.

    Texts are tokenized and classified in slices of *batch_size* so that the
    padded input tensor does not grow with the total number of comments.

    Args:
        texts: List of comment strings.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        A list of label strings taken from ``model.config.id2label``.
    """
    labels = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        # softmax is monotonic, so the argmax of the logits is the same class
        # as the argmax of the probabilities.
        class_ids = logits.argmax(dim=-1).tolist()
        labels.extend(model.config.id2label[class_id] for class_id in class_ids)
    return labels


def _show_sentiment_report(df):
    """Classify the comments in *df* and render charts, table and download.

    Args:
        df: Non-empty DataFrame with the columns "User ID", "Comment" and
            "comment_date" and a default 0..n-1 index.
    """
    sentiments = _predict_sentiments(df["Comment"].tolist())
    sentiment_df = pd.DataFrame(
        {
            "Review Number": range(1, len(sentiments) + 1),
            "Sentiment": sentiments,
        }
    )

    # Share of each sentiment label as a pie chart.
    final_df = (
        sentiment_df["Sentiment"]
        .value_counts()
        .rename_axis("Sentiment")
        .reset_index(name="count")
    )
    fig1 = px.pie(
        final_df,
        values="count",
        names="Sentiment",
        hover_data=["count"],
        labels={"count": "count"},
    )
    fig1.update_traces(textposition="inside", textinfo="percent+label")
    st.plotly_chart(fig1)

    # Both frames share the default index, so the column-wise concat lines up
    # every comment with its own prediction.
    result = pd.concat([df, sentiment_df], axis=1)
    st.dataframe(result)

    # NOTE(review): "comment_date" holds the scrape date as a string, so this
    # bar chart's y axis may not be what was intended - confirm.
    fig2 = px.bar(result, x="Sentiment", y="comment_date", color="Sentiment")
    st.plotly_chart(fig2)

    # Export the combined table as a CSV inside an in-memory zip archive.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as myzip:
        myzip.writestr("Summary of the results.csv", result.to_csv(index=False))
    with stylable_container(
        key="download_button",
        css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
    ):
        st.download_button(
            label="Download zip file",
            data=buf.getvalue(),
            file_name="zip_file.zip",
            mime="application/zip",
        )


# The selectors used by the scraper target YouTube's comment markup, so the
# label asks for a YouTube URL (it previously said "Google Maps URL").
youtube_video_url = st.text_input("Enter YouTube video URL:", key="youtube_video_url")
st.button("Clear question", on_click=clear_question)

if st.button("Scrape Reviews"):
    target_url = youtube_video_url.strip()
    if not target_url:
        # Avoid launching a browser just to fail on an empty address.
        st.warning("Please enter a video URL before scraping.")
    else:
        df = pd.DataFrame(
            _scrape_comments(target_url),
            columns=["User ID", "Comment", "comment_date"],
        )
        st.dataframe(df)
        if df.empty:
            # The tokenizer cannot process an empty batch; stop here instead.
            st.warning("No comments were found, so there is nothing to analyse.")
        elif tokenizer is not None and model is not None:
            _show_sentiment_report(df)
|