File size: 6,743 Bytes
6fafb78
ca12572
 
 
 
 
 
 
 
 
6fafb78
ca12572
6fafb78
ca12572
 
30bf2ff
ca12572
 
 
eae5fab
 
 
6901dd4
eae5fab
 
 
 
ca12572
eae5fab
ca12572
eae5fab
ca12572
 
eae5fab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30bf2ff
 
 
 
983408d
30bf2ff
6fafb78
 
4307f05
6901dd4
a77c47d
30bf2ff
 
 
 
 
 
 
 
 
 
4307f05
a77c47d
6fafb78
a77c47d
 
6fafb78
 
a77c47d
 
 
 
 
 
 
 
 
 
 
6fafb78
a77c47d
30bf2ff
 
 
6fafb78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import streamlit as st
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.chrome import ChromeType
import transformers
import torch
import plotly.express as px

st.subheader("YouTube Comments Sentiment Analysis", divider="red")


@st.cache_resource
def _load_sentiment_model():
    """Load the sentiment tokenizer and classifier once and reuse them.

    Streamlit re-executes this script on every widget interaction; without
    caching, both ``from_pretrained`` calls would be repeated on each rerun.
    ``st.cache_resource`` keeps a single shared instance instead.

    Returns:
        A ``(tokenizer, model)`` tuple for the
        "tabularisai/robust-sentiment-analysis" checkpoint.
    """
    checkpoint = "tabularisai/robust-sentiment-analysis"
    return (
        transformers.DistilBertTokenizer.from_pretrained(checkpoint),
        transformers.DistilBertForSequenceClassification.from_pretrained(checkpoint),
    )


# Module-level names are kept so the rest of the script can use them unchanged.
tokenizer, model = _load_sentiment_model()

# Counter stored in Streamlit session state; set to 0 only when it is absent,
# so an existing value is left untouched on later script runs.
if "url_count" not in st.session_state:
    st.session_state.url_count = 0

# Limit the counter is compared against before an analysis is started.
max_attempts = 5

def update_url_count():
    """Add one to the ``url_count`` value held in Streamlit session state."""
    current = st.session_state['url_count']
    st.session_state['url_count'] = current + 1

def clear_question():
    """Reset the ``url`` session-state entry to an empty string.

    Used as the ``on_click`` callback of the "Clear question" button.
    """
    st.session_state.url = ""

# The text box is bound to the session-state key "url", which is the entry
# the clear_question callback resets.
url = st.text_input(label="Enter YouTube URL:", key="url")
st.button(label="Clear question", on_click=clear_question)

# XPath fallbacks for a comment's timestamp, tried in order.
_TIMESTAMP_XPATHS = (
    './/yt-formatted-string[@class="published-time-text style-scope ytd-comment-renderer"]',
    './ancestor::ytd-comment-renderer//yt-formatted-string[contains(@class, "time-text")]',
    './ancestor::ytd-comment-renderer//a[@id="time"]',
)
# Number of scroll steps performed to trigger lazy loading of comments.
_SCROLL_ROUNDS = 30
# Comments classified per forward pass; bounds the size of the padded batch.
_INFERENCE_BATCH_SIZE = 16


def _find_timestamp(comment):
    """Return the timestamp text for a comment element, or a placeholder.

    Args:
        comment: Selenium element holding the comment text.

    Returns:
        The text of the first timestamp element located through
        ``_TIMESTAMP_XPATHS``, or "Timestamp not found" when every lookup fails.
    """
    last_error = None
    for xpath in _TIMESTAMP_XPATHS:
        try:
            return comment.find_element(By.XPATH, xpath).text
        except Exception as inner_e:
            # Best effort: remember the failure and try the next locator.
            last_error = inner_e
    print(f"Timestamp not found for comment: {comment.text}. Error: {last_error}") #debug
    return "Timestamp not found"


def _scrape_comments(page_url):
    """Open ``page_url`` in headless Chromium and collect the visible comments.

    Progress and errors are reported through Streamlit widgets. The browser is
    always shut down, even when loading or scraping fails.

    Args:
        page_url: Address to load in the browser.

    Returns:
        A list of ``{"Comment": str, "comment_date": str}`` dicts; empty when
        nothing could be scraped.
    """
    options = Options()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--start-maximized",
    ):
        options.add_argument(flag)
    service = Service(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install())
    driver = webdriver.Chrome(service=service, options=options)
    rows = []
    try:
        wait = WebDriverWait(driver, 30)
        try:
            driver.get(page_url)
        except Exception as e:
            # Report a bad or unreachable URL instead of crashing the script.
            st.error(f"Could not open the URL: {e}")
            return rows

        placeholder = st.empty()
        progress_bar = st.progress(0)

        for item in range(_SCROLL_ROUNDS):
            try:
                driver.execute_script("window.scrollBy(0, 500);")
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#content #content-text")))
                placeholder.text(f"Scrolled {item + 1} times")
                progress_bar.progress((item + 1) / _SCROLL_ROUNDS)
                time.sleep(1)  # give dynamically loaded comments time to appear
            except Exception as e:
                st.error(f"Exception during scrolling: {e}")
                break

        placeholder.text("Scrolling complete.")
        progress_bar.empty()

        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#contents #contents")))
            for comment in driver.find_elements(By.CSS_SELECTOR, "#content #content-text"):
                timestamp = _find_timestamp(comment)
                rows.append({"Comment": comment.text, "comment_date": timestamp})
        except Exception as e:
            # Keep whatever rows were collected before the failure.
            st.error(f"Exception during comment extraction: {e}")
    finally:
        # Always release the browser process, whatever happened above.
        driver.quit()
    return rows


def _predict_sentiments(texts, batch_size=_INFERENCE_BATCH_SIZE):
    """Classify each text with the module-level ``tokenizer`` and ``model``.

    Texts are processed in batches so that one long comment does not force
    every comment to be padded to its length in a single huge tensor.

    Args:
        texts: List of comment strings.
        batch_size: Number of texts per forward pass.

    Returns:
        A list of label strings (from ``model.config.id2label``), one per text.
    """
    labels = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            logits = model(**inputs).logits
            # argmax over logits selects the same class as argmax over softmax.
            predicted = logits.argmax(dim=-1).tolist()
            labels.extend(model.config.id2label[index] for index in predicted)
    return labels


if st.button("Sentiment Analysis", type="secondary"):
    if st.session_state['url_count'] >= max_attempts:
        st.warning(f"You have reached the maximum URL attempts ({max_attempts}).")
    elif not url:
        st.warning("Please enter a URL.")
    else:
        # Count this run so the max_attempts limit above is actually enforced.
        update_url_count()
        # NOTE(review): `url` is user-supplied and handed straight to the
        # browser; consider restricting it to YouTube hosts.
        with st.spinner("Wait for it...", show_time=True):
            data = _scrape_comments(url)
            df = pd.DataFrame(data, columns=["Comment", "comment_date"])

            if not df.empty:
                st.dataframe(df)

                sentiments = _predict_sentiments(df['Comment'].tolist())
                sentiment_df = pd.DataFrame({
                    'Review Number': list(range(1, len(sentiments) + 1)),
                    'Sentiment': sentiments,
                })

                # One row per sentiment label with its number of comments.
                final_df = (
                    sentiment_df['Sentiment']
                    .value_counts()
                    .rename_axis('Sentiment')
                    .reset_index(name='count')
                )
                tab1, tab2 = st.tabs(["Pie Chart", "Bar Chart"])
                with tab1:
                    fig1 = px.pie(final_df, values='count', names='Sentiment', hover_data=['count'], labels={'count': 'count'})
                    fig1.update_traces(textposition='inside', textinfo='percent+label')
                    st.plotly_chart(fig1)

                # Comments and predictions side by side (rows share the same order).
                result = pd.concat([df, sentiment_df], axis=1)
                st.dataframe(result)

                with tab2:
                    fig2 = px.bar(result, x="Sentiment", y="comment_date", color="Sentiment")
                    st.plotly_chart(fig2)

                csv = result.to_csv(index=False)
                st.download_button(label="Download data as CSV", data=csv, file_name='Summary of the results.csv', mime='text/csv')
            else:
                st.warning("No comments were scraped. Sentiment analysis could not be performed.")

# Show the current value of the session counter whenever it exists.
if 'url_count' in st.session_state:
    attempts_so_far = st.session_state['url_count']
    st.write(f"URL pasted {attempts_so_far} times.")