Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
from selenium import webdriver
|
4 |
+
from selenium.webdriver.common.by import By
|
5 |
+
from selenium.webdriver.chrome.options import Options
|
6 |
+
|
7 |
+
from selenium.webdriver.chrome.service import Service
|
8 |
+
|
9 |
+
import pandas as pd
|
10 |
+
|
11 |
+
from selenium.webdriver.common.keys import Keys
|
12 |
+
|
13 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
14 |
+
from selenium.webdriver.support import expected_conditions as EC
|
15 |
+
import time
|
16 |
+
import sys
|
17 |
+
from datetime import datetime
|
18 |
+
|
19 |
+
|
20 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
21 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
22 |
+
|
23 |
+
from webdriver_manager.core.os_manager import ChromeType
|
24 |
+
|
25 |
+
import re
|
26 |
+
|
27 |
+
|
28 |
+
import transformers
|
29 |
+
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
|
30 |
+
import io
|
31 |
+
import plotly.express as px
|
32 |
+
import zipfile
|
33 |
+
import torch
|
34 |
+
from streamlit_extras.stylable_container import stylable_container
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
@st.cache_resource
def _load_sentiment_model():
    """Load the sentiment tokenizer and classifier once per server process.

    Streamlit re-executes the whole script on every widget interaction, so
    loading the weights at module level would repeat the load on each rerun.
    ``st.cache_resource`` keeps the loaded objects alive between reruns.

    Returns:
        A ``(tokenizer, model)`` tuple for the
        ``tabularisai/robust-sentiment-analysis`` checkpoint.
    """
    checkpoint = "tabularisai/robust-sentiment-analysis"
    loaded_tokenizer = transformers.DistilBertTokenizer.from_pretrained(checkpoint)
    loaded_model = transformers.DistilBertForSequenceClassification.from_pretrained(checkpoint)
    return loaded_tokenizer, loaded_model


# Module-level names are kept so the rest of the script can keep using them.
tokenizer, model = _load_sentiment_model()
|
45 |
+
|
46 |
+
def clear_question():
    """Reset the URL text box.

    Used as the ``on_click`` callback of the "Clear question" button: it
    overwrites the session-state entry that backs the URL input widget
    with an empty string.
    """
    blank_value = ""
    st.session_state["youtube_video_url"] = blank_value
|
48 |
+
|
49 |
+
# URL of the page to scrape. The widget is keyed in session state so the
# "Clear question" callback can reset it. The scraping code in this file looks
# for YouTube comment markup ("#content-text"), so the label asks for a YouTube
# URL rather than a Google Maps one.
youtube_video_url = st.text_input("Enter YouTube video URL:", key="youtube_video_url")
st.button("Clear question", on_click=clear_question)
|
51 |
+
|
52 |
+
SCROLL_ITERATIONS = 150  # how many times END is sent to trigger lazy loading
SCROLL_PAUSE_SECONDS = 3  # pause after each scroll so new content can load
PAGE_WAIT_SECONDS = 30  # timeout for explicit Selenium waits
INFERENCE_BATCH_SIZE = 32  # comments classified per forward pass


def _build_driver():
    """Start a headless Chromium WebDriver session.

    The sandbox and /dev/shm usage are disabled via command-line flags; the
    matching driver binary is resolved through ``webdriver_manager``.
    """
    options = Options()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--start-maximized",
    ):
        options.add_argument(flag)
    service = Service(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install())
    return webdriver.Chrome(service=service, options=options)


def _scroll_page(driver):
    """Send END to the page body repeatedly, reporting progress in the UI.

    Scrolling stops early, with an error shown, if the body cannot be located
    or the key press fails.
    """
    placeholder = st.empty()
    progress_bar = st.progress(0)
    for iteration in range(1, SCROLL_ITERATIONS + 1):
        try:
            body = WebDriverWait(driver, PAGE_WAIT_SECONDS).until(
                EC.visibility_of_element_located((By.TAG_NAME, "body"))
            )
            body.send_keys(Keys.END)
        except Exception as e:
            st.error(f"Exception during scrolling: {e}")
            break
        placeholder.text(f"Scrolled {iteration} times")
        progress_bar.progress(iteration / SCROLL_ITERATIONS)
        time.sleep(SCROLL_PAUSE_SECONDS)
    placeholder.text("Scrolling complete.")
    progress_bar.empty()


def _collect_comments(driver):
    """Return one row per comment element currently present in the page.

    Rows keep page order and get sequential ``User ID`` values starting at 1.
    ``comment_date`` is the date of the scrape (``datetime.now()``), not the
    date the comment was posted. If extraction fails part-way, the rows
    gathered so far are still returned and the error is shown in the UI.
    """
    rows = []
    try:
        WebDriverWait(driver, PAGE_WAIT_SECONDS).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#contents #contents"))
        )
        elements = driver.find_elements(By.CSS_SELECTOR, "#content #content-text")
        scraped_on = datetime.now().strftime("%Y-%m-%d")
        for user_id, element in enumerate(elements, start=1):
            rows.append(
                {"User ID": user_id, "Comment": element.text, "comment_date": scraped_on}
            )
    except Exception as e:
        st.error(f"Exception during comment extraction: {e}")
    return rows


def _scrape_comments(url):
    """Open ``url``, scroll to load comments, and return them as a DataFrame.

    The browser is always shut down, even when navigation or scraping raises,
    so a failed run does not leave a Chromium process behind.
    """
    driver = _build_driver()
    try:
        driver.get(url)
        _scroll_page(driver)
        rows = _collect_comments(driver)
    finally:
        driver.quit()
    return pd.DataFrame(rows, columns=["User ID", "Comment", "comment_date"])


def _classify_comments(texts):
    """Return the predicted sentiment label for each text, in input order.

    Texts are processed in fixed-size batches so that padding to the longest
    item and the forward pass stay bounded in memory for large scrapes.
    """
    labels = []
    for start in range(0, len(texts), INFERENCE_BATCH_SIZE):
        batch = texts[start:start + INFERENCE_BATCH_SIZE]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        # argmax over logits selects the same class as argmax over softmax.
        predicted_ids = logits.argmax(dim=-1).tolist()
        labels.extend(model.config.id2label[class_id] for class_id in predicted_ids)
    return labels


def _render_sentiment_report(df):
    """Classify the scraped comments and render charts plus a download button."""
    if df.empty:
        # The tokenizer cannot be called with an empty list of texts.
        st.warning("No comments were found, so there is nothing to analyze.")
        return
    if tokenizer is None or model is None:
        return

    sentiments = _classify_comments(df["Comment"].tolist())
    sentiment_df = pd.DataFrame(
        {
            "Review Number": range(1, len(sentiments) + 1),
            "Sentiment": sentiments,
        }
    )

    # Share of each sentiment class.
    final_df = (
        sentiment_df["Sentiment"]
        .value_counts()
        .rename_axis("Sentiment")
        .reset_index(name="count")
    )
    fig1 = px.pie(
        final_df,
        values="count",
        names="Sentiment",
        hover_data=["count"],
        labels={"count": "count"},
    )
    fig1.update_traces(textposition="inside", textinfo="percent+label")
    st.plotly_chart(fig1)

    # Both frames share the default 0..n-1 index, so rows line up one-to-one.
    result = pd.concat([df, sentiment_df], axis=1)
    st.dataframe(result)

    fig2 = px.bar(result, x="Sentiment", y="comment_date", color="Sentiment")
    st.plotly_chart(fig2)

    # Package the combined table as a CSV inside an in-memory zip archive.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as myzip:
        myzip.writestr("Summary of the results.csv", result.to_csv(index=False))
    with stylable_container(
        key="download_button",
        css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
    ):
        st.download_button(
            label="Download zip file",
            data=buf.getvalue(),
            file_name="zip_file.zip",
            mime="application/zip",
        )


if st.button("Scrape Reviews"):
    if not youtube_video_url.strip():
        # Avoid launching a browser just to fail on an empty address.
        st.warning("Please enter a URL before scraping.")
    else:
        df = _scrape_comments(youtube_video_url)
        st.dataframe(df)
        _render_sentiment_report(df)
|
140 |
+
|
141 |
+
|
142 |
+
|
143 |
+
|
144 |
+
|
145 |
+
|
146 |
+
|