Spaces:
Running
Running
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +543 -16
src/streamlit_app.py
CHANGED
@@ -1,10 +1,15 @@
|
|
1 |
import os
|
2 |
os.environ["STREAMLIT_GLOBAL_CONFIG"] = "/data/.streamlit/config.toml"
|
|
|
3 |
import uuid
|
4 |
import random
|
5 |
import urllib.parse # To parse URL parameters
|
|
|
6 |
import streamlit as st
|
|
|
|
|
7 |
import duckdb
|
|
|
8 |
|
9 |
# Database file path
|
10 |
DB_PATH = 'steampolis.duckdb'
|
@@ -92,8 +97,473 @@ def initialize_database():
|
|
92 |
if 'init_con' in locals() and init_con:
|
93 |
init_con.close()
|
94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
# Helper function to get a random unvoted comment
|
96 |
def get_random_unvoted_comment(user_id, topic_id):
|
|
|
|
|
|
|
|
|
|
|
97 |
local_con = None
|
98 |
try:
|
99 |
local_con = duckdb.connect(database=DB_PATH, read_only=False)
|
@@ -104,7 +574,7 @@ def get_random_unvoted_comment(user_id, topic_id):
|
|
104 |
""", [topic_id]).fetchone()[0]
|
105 |
|
106 |
if comment_count == 0:
|
107 |
-
return None, "
|
108 |
|
109 |
# Attempt to get a random comment that the user has NOT voted on
|
110 |
result = local_con.execute("""
|
@@ -118,8 +588,27 @@ def get_random_unvoted_comment(user_id, topic_id):
|
|
118 |
ORDER BY RANDOM()
|
119 |
LIMIT 1
|
120 |
""", [topic_id, user_id]).fetchone()
|
121 |
-
|
122 |
if result:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
# Found an unvoted comment
|
124 |
return result[0], result[1]
|
125 |
else:
|
@@ -220,12 +709,12 @@ def home_page():
|
|
220 |
st.title("Welcome to SteamPolis")
|
221 |
st.markdown("Choose an option:")
|
222 |
|
223 |
-
if st.button("Create New Topic"):
|
224 |
st.session_state.page = 'create_topic'
|
225 |
st.rerun()
|
226 |
|
227 |
st.markdown("---")
|
228 |
-
st.markdown("Or join an existing topic:")
|
229 |
topic_input = st.text_input("Enter Topic ID or URL")
|
230 |
|
231 |
if st.button("Join Topic"):
|
@@ -308,6 +797,7 @@ def view_topic_page():
|
|
308 |
current_comment_id = st.session_state.get('current_comment_id')
|
309 |
current_comment_content = st.session_state.get('current_comment_content', "Loading comments...")
|
310 |
comment_history = st.session_state.get('comment_history', "")
|
|
|
311 |
|
312 |
if not topic_id:
|
313 |
st.warning("No topic selected. Returning to home.")
|
@@ -342,7 +832,7 @@ def view_topic_page():
|
|
342 |
|
343 |
|
344 |
# Include functional information
|
345 |
-
st.markdown(f"**Quest Scroll ID:** `{topic_id}`")
|
346 |
# Construct shareable link using current app URL
|
347 |
app_url = st.query_params.get('base', ['http://localhost:8501/'])[0] # Get base URL if available
|
348 |
shareable_link = f"{app_url}?topic={topic_id}" if app_url else f"?topic={topic_id}"
|
@@ -436,24 +926,50 @@ def view_topic_page():
|
|
436 |
]
|
437 |
# Randomly select a phrase
|
438 |
random_phrase = random.choice(intro_phrases)
|
|
|
439 |
|
440 |
if current_comment_id: # Only show voting if there's a comment to vote on
|
441 |
# Display comment history and the current comment with the random intro
|
442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
|
444 |
# Handle vote logic
|
445 |
def handle_vote(vote_type, comment_id, topic_id, user_id):
|
|
|
|
|
|
|
|
|
446 |
local_con = None
|
447 |
try:
|
448 |
local_con = duckdb.connect(database=DB_PATH, read_only=False)
|
449 |
-
|
|
|
|
|
|
|
|
|
|
|
450 |
local_con.execute("""
|
451 |
INSERT INTO votes (id, user_id, comment_id, vote_type)
|
452 |
VALUES (?, ?, ?, ?)
|
|
|
|
|
|
|
|
|
453 |
""", [vote_id, user_id, comment_id, vote_type])
|
454 |
|
455 |
# Append voted comment to history
|
|
|
|
|
456 |
vote_text = "π" if vote_type == "agree" else "π" if vote_type == "disagree" else "π"
|
|
|
|
|
|
|
|
|
457 |
st.session_state.comment_history += f"\n\n{vote_text} {current_comment_content}"
|
458 |
|
459 |
# Check vote count and trigger special event
|
@@ -461,11 +977,14 @@ def view_topic_page():
|
|
461 |
if 'vote_count' not in st.session_state:
|
462 |
st.session_state.vote_count = 0
|
463 |
|
|
|
|
|
|
|
464 |
st.session_state.vote_count += 1
|
465 |
|
466 |
-
# Check if it's time for a potential special event (every 5 votes)
|
467 |
if st.session_state.vote_count % 5 == 0:
|
468 |
-
st.session_state.vote_count = 0
|
469 |
# 30% chance to trigger the special sharing event
|
470 |
if random.random() < 0.3:
|
471 |
prompts = [
|
@@ -473,14 +992,17 @@ def view_topic_page():
|
|
473 |
"A letter arrives from the Emperor's office, requesting your personal insight on the matter. What counsel do you offer?",
|
474 |
"As you walk through the streets, people gather, eager to hear your thoughts on the Emperor's dilemma. What advice do you give?"
|
475 |
]
|
|
|
476 |
share_wisdom(random.choice(prompts), allow_skip=True)
|
477 |
|
478 |
# Get next comment
|
|
|
479 |
next_comment_id, next_comment_content = get_random_unvoted_comment(user_id, topic_id)
|
480 |
st.session_state.current_comment_id = next_comment_id
|
481 |
st.session_state.current_comment_content = next_comment_content
|
482 |
|
483 |
# Update progress
|
|
|
484 |
update_user_progress(user_id, topic_id, next_comment_id)
|
485 |
|
486 |
st.rerun() # Rerun to update UI
|
@@ -531,19 +1053,24 @@ if 'current_comment_content' not in st.session_state:
|
|
531 |
st.session_state.current_comment_content = "Loading comments..."
|
532 |
if 'comment_history' not in st.session_state:
|
533 |
st.session_state.comment_history = ""
|
|
|
|
|
534 |
|
535 |
# Initialize the database on first run
|
536 |
initialize_database()
|
537 |
|
538 |
# Handle initial load from URL query parameters
|
|
|
539 |
query_params = st.query_params
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
-
|
|
|
|
|
547 |
|
548 |
|
549 |
# Render the appropriate page based on session state
|
|
|
1 |
import os
|
2 |
os.environ["STREAMLIT_GLOBAL_CONFIG"] = "/data/.streamlit/config.toml"
|
3 |
+
import time
|
4 |
import uuid
|
5 |
import random
|
6 |
import urllib.parse # To parse URL parameters
|
7 |
+
from functools import lru_cache
|
8 |
import streamlit as st
|
9 |
+
import numpy as np
|
10 |
+
import pandas as pd
|
11 |
import duckdb
|
12 |
+
import hdbscan
|
13 |
|
14 |
# Database file path
|
15 |
DB_PATH = 'steampolis.duckdb'
|
|
|
97 |
if 'init_con' in locals() and init_con:
|
98 |
init_con.close()
|
99 |
|
100 |
+
def get_ttl_hash(seconds=360):
    """Return a value that stays constant within each `seconds`-long window.

    Passing the result as an extra argument to an `lru_cache`-decorated
    function gives that cache an effective TTL: the returned integer changes
    once per window, forcing a cache miss on the next call.

    Args:
        seconds (int): Length of the window in seconds (default 360).

    Returns:
        int: `time.time() / seconds` rounded to the nearest integer.
    """
    return round(time.time() / seconds)
|
103 |
+
|
104 |
+
# Helper function to get the R matrix from user voting data
|
105 |
+
# This matrix represents user-comment interactions (votes)
|
106 |
+
# Users are rows, comments are columns.
|
107 |
+
# Values: 1 for 'agree', 0 for 'neutral', -1 for 'disagree', NaN for unvoted.
|
108 |
+
# Requires pandas and numpy.
|
109 |
+
def get_r_matrix_from_votes():
    """Build the user x comment vote matrix (the "R matrix").

    Rows are user_ids, columns are comment_ids. Cell values: 1 ('agree'),
    0 ('neutral'), -1 ('disagree'), NaN where the user never voted on the
    comment.

    Returns:
        tuple: (r_matrix, user_id_to_index, comment_id_to_index) where the
            index dicts map ids to row/column positions. All three are empty
            when there are no votes or on any error.
    """
    connection = None
    try:
        # Read-only connection is enough for a pure query.
        connection = duckdb.connect(database=DB_PATH, read_only=True)

        raw_votes = connection.execute("""
            SELECT user_id, comment_id, vote_type
            FROM votes
        """).fetchdf()

        if raw_votes.empty:
            # Nothing voted yet: empty matrix, empty mappings.
            return pd.DataFrame(), {}, {}

        # Translate categorical vote types into numeric scores.
        score_for = {'agree': 1, 'neutral': 0, 'disagree': -1}
        raw_votes['vote_value'] = raw_votes['vote_type'].map(score_for)

        # pivot_table leaves NaN for user/comment pairs with no vote.
        matrix = raw_votes.pivot_table(
            index='user_id',
            columns='comment_id',
            values='vote_value'
        )

        # Positional lookups for downstream clustering code.
        user_positions = {uid: pos for pos, uid in enumerate(matrix.index)}
        comment_positions = {cid: pos for pos, cid in enumerate(matrix.columns)}

        return matrix, user_positions, comment_positions

    except Exception as e:
        # No Streamlit context here; log to stdout and degrade gracefully.
        print(f"Error generating R matrix: {e}")
        return pd.DataFrame(), {}, {}
    finally:
        if connection:
            connection.close()
|
154 |
+
|
155 |
+
|
156 |
+
# Custom Hamming-like distance function handling NaNs for clustering
|
157 |
+
# Assumes numpy is imported as np
|
158 |
+
def hamming_distance_with_nan(u1, u2):
    """Hamming-style distance between two vote vectors, ignoring NaN gaps.

    Only positions where *both* vectors hold a real (non-NaN) value are
    compared; the distance is the fraction of those positions that differ.

    Args:
        u1 (np.ndarray or pd.Series): First vote profile.
        u2 (np.ndarray or pd.Series): Second vote profile.

    Returns:
        float: Proportion of differing elements among shared non-NaN
            positions. With no shared positions: 0.0 when the vectors are
            identical (NaNs matching NaNs), else 1.0.
    """
    u1 = np.asarray(u1)
    u2 = np.asarray(u2)

    # Positions where a comparison is meaningful (neither side is NaN).
    shared = ~np.isnan(u1) & ~np.isnan(u2)

    if not shared.any():
        # No overlap at all: identical vectors score 0, otherwise max distance.
        return 0.0 if np.array_equal(u1, u2, equal_nan=True) else 1.0

    lhs = u1[shared]
    rhs = u2[shared]

    # Fraction of mismatches over the comparable positions.
    return np.count_nonzero(lhs != rhs) / len(lhs)
|
196 |
+
|
197 |
+
|
198 |
+
# Function to get clusters using HDBSCAN with the custom Hamming distance
|
199 |
+
# Assumes pandas is imported as pd, numpy as np, and hdbscan is imported
|
200 |
+
def get_clusters_from_r_matrix(r_matrix):
    """Cluster users by vote profile with HDBSCAN and a NaN-aware metric.

    Args:
        r_matrix (pd.DataFrame): User x comment vote matrix from
            get_r_matrix_from_votes (rows: user_id, columns: comment_id).

    Returns:
        np.ndarray: One cluster label per row of r_matrix (-1 means noise).
            Empty array when the matrix is empty or clustering fails.
    """
    if r_matrix.empty:
        # Nothing to cluster.
        print("R matrix is empty, cannot perform clustering.")
        return np.array([])

    try:
        # Default min_cluster_size/min_samples; tune if cluster granularity
        # proves unsuitable. allow_single_cluster avoids an error when all
        # points collapse into one cluster.
        model = hdbscan.HDBSCAN(
            metric=hamming_distance_with_nan,
            allow_single_cluster=True,
        )
        # HDBSCAN wants a plain array, not a DataFrame.
        model.fit(r_matrix.values)
        return model.labels_

    except Exception as e:
        # No Streamlit context here; log to stdout and degrade gracefully.
        print(f"Error during HDBSCAN clustering: {e}")
        return np.array([])
|
238 |
+
|
239 |
+
|
240 |
+
def get_cluster_labels():
    """Cluster every voting user and return the labels with the row mapping.

    Returns:
        tuple: (labels, user_id_to_index) where labels[i] is the cluster id
            for the user at row i of the R matrix. When clustering yields
            nothing, every known user is placed in a single cluster 0.
    """
    votes_matrix, user_index_map, _ = get_r_matrix_from_votes()
    labels = get_clusters_from_r_matrix(votes_matrix)
    if len(labels) == 0:
        # Fallback: one catch-all cluster covering all known users.
        labels = [0] * len(user_index_map)
    return labels, user_index_map
|
246 |
+
|
247 |
+
|
248 |
+
# Function to get the cluster label for a specific user
|
249 |
+
@lru_cache()
def get_user_cluster_label(user_id, ttl_hash=None):
    """
    Gets the HDBSCAN cluster label for a specific user and a list of users
    sharing the same cluster.

    Args:
        user_id (str): The ID of the user.
        ttl_hash (int | None): Cache-busting token; callers are expected to
            pass get_ttl_hash() so lru_cache entries expire each TTL window.
            Never read in the body — it only participates in the cache key.
            NOTE(review): one caller passes a topic_id here instead — confirm
            intent; it only affects cache keying, not results.

    Returns:
        tuple: A tuple containing:
            - int or None: The cluster label (an integer, -1 for noise) if the user
              is found in the clustering result, otherwise None.
            - list[str]: A list of user IDs (including the input user_id if found)
              that belong to the same cluster. Returns an empty list
              if the user is not found or has no cluster label.
    """
    # NOTE(review): get_cluster_labels is NOT itself cached, so every cache
    # miss here re-runs the full R-matrix build and HDBSCAN clustering.
    cluster_labels, user_id_to_index = get_cluster_labels()

    # Create a reverse mapping from index to user_id for easier lookup
    index_to_user_id = {index: uid for uid, index in user_id_to_index.items()}

    target_cluster_label = None
    same_cluster_users = []

    # Check if the user_id exists in the mapping
    if user_id in user_id_to_index:
        user_index = user_id_to_index[user_id]
        # Ensure the index is within the bounds of the cluster_labels array
        if 0 <= user_index < len(cluster_labels):
            target_cluster_label = int(cluster_labels[user_index]) # Get the target label

            # Collect every user whose label matches the target's label
            for index, current_user_id in index_to_user_id.items():
                # Ensure the index is valid for cluster_labels
                if 0 <= index < len(cluster_labels):
                    current_user_label = int(cluster_labels[index])
                    if current_user_label == target_cluster_label:
                        same_cluster_users.append(current_user_id)
                else:
                    # Should not happen if index_to_user_id is consistent with labels
                    print(f"Warning: Index {index} from index_to_user_id out of bounds for cluster labels array length {len(cluster_labels)}")


        else:
            # Should not happen if user_id_to_index is consistent with labels
            print(f"Warning: User index {user_index} out of bounds for cluster labels array length {len(cluster_labels)}")
            # Return None and empty list as user couldn't be processed
            return None, []
    else:
        # User not found in the R matrix used for clustering (e.g., new user with no votes)
        # Return None and empty list as user is not part of the current clustering result
        return None, []

    # Return the target user's label and the list of users in that cluster
    return target_cluster_label, same_cluster_users
|
307 |
+
|
308 |
+
|
309 |
+
# Helper function to get top k most polarized comments for a list of users
|
310 |
+
def get_top_k_polarized_comments_for_users(user_ids, k=5):
    """
    Retrieves the top k comments most agreed or disagreed upon (most polarized)
    by a given list of users.

    Args:
        user_ids (list[str]): A list of user IDs. NOTE(review): one caller
            passes a set — confirm the DuckDB driver accepts non-list
            iterables as a bound parameter.
        k (int): The number of top comments to retrieve.

    Returns:
        list[tuple]: A list of tuples, where each tuple contains
            (comment_id, comment_content, average_vote_score),
            ordered by the absolute value of the average score descending.
            Returns an empty list if no votes are found for these users
            or on error.
    """
    if not user_ids:
        # Cannot query without user IDs
        return []

    local_con = None
    try:
        local_con = duckdb.connect(database=DB_PATH, read_only=True)

        # Votes are scored agree=1, neutral=0, disagree=-1; the average per
        # comment is a polarization score whose magnitude drives the ranking.
        # NOTE(review): "IN (?)" with a bound Python list relies on DuckDB's
        # list-parameter expansion — verify against the installed DuckDB
        # version ("= ANY(?)" is the documented list form).
        query = """
            SELECT
                v.comment_id,
                c.content,
                AVG(CASE
                        WHEN v.vote_type = 'agree' THEN 1.0
                        WHEN v.vote_type = 'neutral' THEN 0.0
                        WHEN v.vote_type = 'disagree' THEN -1.0
                        ELSE NULL -- Should not happen with current data
                    END) as average_vote_score
            FROM votes v
            JOIN comments c ON v.comment_id = c.id
            WHERE v.user_id IN (?)
            GROUP BY v.comment_id, c.content
            HAVING COUNT(v.user_id) > 0 -- Ensure at least one user from the list voted on this comment
            ORDER BY ABS(average_vote_score) DESC
            LIMIT ?
        """
        # Pass the list of user_ids and k as parameters
        result = local_con.execute(query, [user_ids, k]).fetchall()

        return result

    except Exception as e:
        # st.error is not available here, just print or log
        print(f"Error getting top k polarized comments for users {user_ids}: {e}")
        return [] # Return empty list on error
    finally:
        if local_con:
            local_con.close()
|
366 |
+
|
367 |
+
|
368 |
+
def estimate_group_voting_diversity(user_ids, topic_id):
    """
    Estimates the diversity of voting within a group of users for a specific topic.
    Diversity is measured by the average variance of numerical vote scores (-1, 0, 1)
    across comments that at least two users in the group have voted on.

    Args:
        user_ids (list[str]): A list of user IDs belonging to the group
            (any iterable of IDs is accepted).
        topic_id (str): The ID of the topic.

    Returns:
        float: A diversity score between 0.0 and 1.0. 0.0 indicates no diversity
            (all users voted the same way on all shared comments), 1.0 indicates
            maximum possible diversity. Returns 0.0 if the group has fewer than
            2 users, if no comments were voted on by at least two group members,
            or on error.
    """
    # Bug fix: @lru_cache on the implementation requires hashable arguments,
    # but callers (e.g. name_user_group) pass a plain list, which raised
    # TypeError at the decorator on every call. Normalize to a sorted tuple
    # (hashable, order-insensitive) and delegate to the cached core.
    return _estimate_group_voting_diversity_cached(tuple(sorted(user_ids)), topic_id)


@lru_cache()
def _estimate_group_voting_diversity_cached(user_ids, topic_id):
    """Cached core of estimate_group_voting_diversity; `user_ids` is a tuple."""
    if not user_ids or len(user_ids) < 2:
        return 0.0

    local_con = None
    try:
        local_con = duckdb.connect(database=DB_PATH, read_only=True)

        # All votes by these users on comments belonging to the topic.
        query = """
            SELECT
                v.comment_id,
                v.user_id,
                v.vote_type
            FROM votes v
            JOIN comments c ON v.comment_id = c.id
            WHERE c.topic_id = ? AND v.user_id IN (?)
        """
        # DuckDB's Python API handles lists for IN clauses; convert the
        # cache-friendly tuple back to a list for binding.
        results = local_con.execute(query, [topic_id, list(user_ids)]).fetchall()

        if not results:
            return 0.0 # No votes found for this group on this topic

        # Map vote types to numerical scores
        vote_map = {'agree': 1.0, 'neutral': 0.0, 'disagree': -1.0}

        # Group numerical vote scores by comment ID
        votes_by_comment = {}
        for comment_id, user_id, vote_type in results:
            if comment_id not in votes_by_comment:
                votes_by_comment[comment_id] = []
            votes_by_comment[comment_id].append(vote_map.get(vote_type, 0.0)) # Default to 0.0 for unknown types

        # Variance per comment, only where at least two group members voted
        variances = []
        for comment_id, comment_votes in votes_by_comment.items():
            if len(comment_votes) >= 2:
                variances.append(np.var(comment_votes))

        if not variances:
            return 0.0 # No comments voted on by at least two users in the group

        # The maximum possible variance for values in [-1, 0, 1] is 1.0
        # (e.g., half votes are 1, half are -1), so the mean is already in [0, 1].
        average_variance = np.mean(variances)

        return average_variance

    except Exception as e:
        # st.error is not available here, just print or log
        print(f"Error estimating group voting diversity for topic {topic_id} and users {user_ids}: {e}")
        return 0.0 # Return 0.0 on error
    finally:
        if local_con:
            local_con.close()
|
446 |
+
|
447 |
+
|
448 |
+
# Helper function to name a group of users based on their participation and voting diversity
|
449 |
+
def name_user_group(user_ids, topic_id):
    """
    Generates a descriptive name and description for a group of users within a
    specific topic based on their participation level and voting diversity.

    Args:
        user_ids (list[str]): A list of user IDs belonging to the group.
        topic_id (str): The ID of the topic.

    Returns:
        tuple[str, str]: A tuple containing the name and description for the group.
            Returns ("Silent Gathering", ...) for an empty group,
            ("Unengaged Group", ...) when no member voted on this topic,
            ("Isolated Voices", ...) when the topic has no voters at all, and
            ("Mysterious Gathering", ...) on any unexpected error.
    """
    # Empty group: answer without touching the database.
    if not user_ids:
        return "Silent Gathering", "This group has no members."

    local_con = None
    try:
        local_con = duckdb.connect(database=DB_PATH, read_only=True)

        # 1. Count distinct users who voted anywhere in this topic.
        total_voters_result = local_con.execute("""
            SELECT COUNT(DISTINCT user_id)
            FROM votes v
            JOIN comments c ON v.comment_id = c.id
            WHERE c.topic_id = ?
        """, [topic_id]).fetchone()
        total_voters_in_topic = total_voters_result[0] if total_voters_result else 0

        # 2. Count distinct group members who voted in this topic.
        # NOTE(review): "IN (?)" with a bound Python list relies on DuckDB's
        # list-parameter handling — confirm against the installed version.
        group_voters_result = local_con.execute("""
            SELECT COUNT(DISTINCT user_id)
            FROM votes v
            JOIN comments c ON v.comment_id = c.id
            WHERE c.topic_id = ? AND v.user_id IN (?)
        """, [topic_id, user_ids]).fetchone()
        group_voters_count = group_voters_result[0] if group_voters_result else 0

        # No group member has voted on this topic.
        if group_voters_count == 0:
            return "Unengaged Group", "No members of this group have voted on this topic."

        # Defensive: unlikely when group_voters_count > 0, but guards the
        # division below against a zero denominator.
        if total_voters_in_topic == 0:
            return "Isolated Voices", "This topic has no voters yet."


        # 3. Significance = the group's share of all voters on this topic.
        significance_proportion = group_voters_count / total_voters_in_topic

        # 4. Diversity of the group's votes on this topic (0.0 .. 1.0).
        diversity_score = estimate_group_voting_diversity(user_ids, topic_id)

        # 5. Bucket both scores into low/medium/high (thresholds are tunable).
        SIG_LOW_THRESHOLD = 0.1
        SIG_MED_THRESHOLD = 0.5 # High if > MED, Med if > LOW and <= MED, Low if <= LOW
        DIV_LOW_THRESHOLD = 0.2
        DIV_MED_THRESHOLD = 0.5 # High if > MED, Med if > LOW and <= MED, Low if <= LOW

        significance_level = "low"
        if significance_proportion > SIG_MED_THRESHOLD:
            significance_level = "high"
        elif significance_proportion > SIG_LOW_THRESHOLD:
            significance_level = "medium"

        diversity_level = "low"
        if diversity_score > DIV_MED_THRESHOLD:
            diversity_level = "high"
        elif diversity_score > DIV_LOW_THRESHOLD:
            diversity_level = "medium"

        # Map the (significance, diversity) pair to a themed name + description.
        if significance_level == "high":
            if diversity_level == "low":
                return "Likeheart Village", "A large group where opinions converge."
            elif diversity_level == "medium":
                return "Harmonious Assembly", "A significant gathering with mostly aligned views."
            else: # high diversity
                return "Vibrant Forum", "A large, active group with diverse perspectives."
        elif significance_level == "medium":
            if diversity_level == "low":
                return "Quiet Consensus", "A moderately sized group with little disagreement."
            elif diversity_level == "medium":
                return "Mixed Opinions", "A balanced group with varied viewpoints."
            else: # high diversity
                return "Lively Discussion", "A moderately sized group with strong, differing opinions."
        else: # low significance
            if diversity_level == "low":
                return "Echo Chamber Nook", "A small corner where similar thoughts resonate."
            elif diversity_level == "medium":
                return "Scattered Thoughts", "A small group with somewhat varied, isolated views."
            else: # high diversity
                return "Whispering Gallery", "A small group where many different ideas are quietly shared."

    except Exception as e:
        print(f"Error naming user group for topic {topic_id} and users {user_ids}: {e}")
        return "Mysterious Gathering", "An error occurred while trying to name this group." # Default name and description on error
    finally:
        if local_con:
            local_con.close()
|
558 |
+
|
559 |
+
|
560 |
# Helper function to get a random unvoted comment
|
561 |
def get_random_unvoted_comment(user_id, topic_id):
|
562 |
+
new_area_comments = st.session_state.get("_new_area_comments", [])
|
563 |
+
if len(new_area_comments) != 0:
|
564 |
+
value = new_area_comments.pop()
|
565 |
+
st.session_state._new_area_comments = new_area_comments
|
566 |
+
return value[0], value[1]
|
567 |
local_con = None
|
568 |
try:
|
569 |
local_con = duckdb.connect(database=DB_PATH, read_only=False)
|
|
|
574 |
""", [topic_id]).fetchone()[0]
|
575 |
|
576 |
if comment_count == 0:
|
577 |
+
return None, "Share your insight!"
|
578 |
|
579 |
# Attempt to get a random comment that the user has NOT voted on
|
580 |
result = local_con.execute("""
|
|
|
588 |
ORDER BY RANDOM()
|
589 |
LIMIT 1
|
590 |
""", [topic_id, user_id]).fetchone()
|
|
|
591 |
if result:
|
592 |
+
# Check for cluster change and set message flag
|
593 |
+
current_label, current_users = get_user_cluster_label(user_id, topic_id)
|
594 |
+
current_users_set = set(current_users)
|
595 |
+
|
596 |
+
previous_label = st.session_state.get('_previous_cluster_label')
|
597 |
+
previous_users_set = st.session_state.get('_previous_cluster_users_set', set())
|
598 |
+
|
599 |
+
# Check if cluster label has changed AND the set of users in the new cluster is different
|
600 |
+
# This indicates the user has moved to a different group of commenters
|
601 |
+
if current_label is not None and previous_label is not None and current_label != previous_label:
|
602 |
+
if current_users_set != previous_users_set:
|
603 |
+
# Set a flag in session state to display the message later in the main rendering logic
|
604 |
+
st.session_state._show_new_area_message = True
|
605 |
+
new_area_comments = get_top_k_polarized_comments_for_users(current_users_set, k=5)
|
606 |
+
st.session_state._new_area_comments = new_area_comments
|
607 |
+
# print(f"DEBUG: Cluster changed for user {user_id} in topic {topic_id}: {previous_label} -> {current_label}")
|
608 |
+
# print(f"DEBUG: Previous users count: {len(previous_users_set)}, Current users count: {len(current_users_set)}")
|
609 |
+
st.session_state._previous_cluster_label = current_label
|
610 |
+
st.session_state._previous_cluster_users_set = current_users_set
|
611 |
+
|
612 |
# Found an unvoted comment
|
613 |
return result[0], result[1]
|
614 |
else:
|
|
|
709 |
st.title("Welcome to SteamPolis")
|
710 |
st.markdown("Choose an option:")
|
711 |
|
712 |
+
if st.button("Create New Topic (Quest)"):
|
713 |
st.session_state.page = 'create_topic'
|
714 |
st.rerun()
|
715 |
|
716 |
st.markdown("---")
|
717 |
+
st.markdown("Or join an existing topic (quest):")
|
718 |
topic_input = st.text_input("Enter Topic ID or URL")
|
719 |
|
720 |
if st.button("Join Topic"):
|
|
|
797 |
current_comment_id = st.session_state.get('current_comment_id')
|
798 |
current_comment_content = st.session_state.get('current_comment_content', "Loading comments...")
|
799 |
comment_history = st.session_state.get('comment_history', "")
|
800 |
+
show_new_area_message = st.session_state.get('_show_new_area_message', True)
|
801 |
|
802 |
if not topic_id:
|
803 |
st.warning("No topic selected. Returning to home.")
|
|
|
832 |
|
833 |
|
834 |
# Include functional information
|
835 |
+
st.markdown(f"**Shareable Quest Scroll ID:** `{topic_id}`")
|
836 |
# Construct shareable link using current app URL
|
837 |
app_url = st.query_params.get('base', ['http://localhost:8501/'])[0] # Get base URL if available
|
838 |
shareable_link = f"{app_url}?topic={topic_id}" if app_url else f"?topic={topic_id}"
|
|
|
926 |
]
|
927 |
# Randomly select a phrase
|
928 |
random_phrase = random.choice(intro_phrases)
|
929 |
+
st.markdown(comment_history)
|
930 |
|
931 |
if current_comment_id: # Only show voting if there's a comment to vote on
|
932 |
# Display comment history and the current comment with the random intro
|
933 |
+
if show_new_area_message == True:
|
934 |
+
_, user_ids = get_user_cluster_label(user_id)
|
935 |
+
new_area_name, desc = name_user_group(user_ids, topic_id)
|
936 |
+
st.markdown(f"You've collected {len(comment_history.splitlines())} insights so far.")
|
937 |
+
st.markdown(f"And yet a new place you have arrived: `{new_area_name}`. {desc}")
|
938 |
+
st.session_state._show_new_area_message = False
|
939 |
+
st.markdown(f"[Collected new insight, {random_phrase}]:\n* {current_comment_content}")
|
940 |
|
941 |
# Handle vote logic
|
942 |
def handle_vote(vote_type, comment_id, topic_id, user_id):
|
943 |
+
# Add JavaScript to scroll to the bottom anchor after the page reloads
|
944 |
+
# This script will be included in the next render cycle triggered by st.rerun()
|
945 |
+
# Ensure an element with id="bottom" exists in the rendered page,
|
946 |
+
# typically placed after the content you want to scroll to (e.g., comment history).
|
947 |
local_con = None
|
948 |
try:
|
949 |
local_con = duckdb.connect(database=DB_PATH, read_only=False)
|
950 |
+
# Use INSERT OR REPLACE INTO or ON CONFLICT DO UPDATE to handle repeat votes
|
951 |
+
# The UNIQUE constraint on (user_id, comment_id) in the votes table
|
952 |
+
# allows us to update the existing vote if one already exists for this user/comment pair.
|
953 |
+
# We generate a new UUID for the 'id' column, but it will only be used
|
954 |
+
# if this is a new insert. If it's an update, the existing 'id' is kept.
|
955 |
+
vote_id = str(uuid.uuid4()) # Generate a new UUID for the potential insert
|
956 |
local_con.execute("""
|
957 |
INSERT INTO votes (id, user_id, comment_id, vote_type)
|
958 |
VALUES (?, ?, ?, ?)
|
959 |
+
ON CONFLICT (user_id, comment_id)
|
960 |
+
DO UPDATE SET
|
961 |
+
vote_type = excluded.vote_type, -- Update vote_type with the new value
|
962 |
+
created_at = current_localtimestamp(); -- Update timestamp to reflect the latest vote
|
963 |
""", [vote_id, user_id, comment_id, vote_type])
|
964 |
|
965 |
# Append voted comment to history
|
966 |
+
# Note: This appends the comment regardless of whether it was a new vote or an update.
|
967 |
+
# The history is a simple log, not a reflection of vote changes.
|
968 |
vote_text = "π" if vote_type == "agree" else "π" if vote_type == "disagree" else "π"
|
969 |
+
comment_history = st.session_state.comment_history.split("\n\n")
|
970 |
+
if len(comment_history) > 10:
|
971 |
+
comment_history = ["..."] + comment_history[-10:]
|
972 |
+
st.session_state.comment_history = "\n\n".join(comment_history)
|
973 |
st.session_state.comment_history += f"\n\n{vote_text} {current_comment_content}"
|
974 |
|
975 |
# Check vote count and trigger special event
|
|
|
977 |
if 'vote_count' not in st.session_state:
|
978 |
st.session_state.vote_count = 0
|
979 |
|
980 |
+
# Increment vote count only if it was a new vote or a change?
|
981 |
+
# The current logic increments on every button click. Let's keep that for now
|
982 |
+
# as it drives the special event trigger based on interaction frequency.
|
983 |
st.session_state.vote_count += 1
|
984 |
|
985 |
+
# Check if it's time for a potential special event (every 5 votes/interactions)
|
986 |
if st.session_state.vote_count % 5 == 0:
|
987 |
+
st.session_state.vote_count = 0 # Reset count after triggering
|
988 |
# 30% chance to trigger the special sharing event
|
989 |
if random.random() < 0.3:
|
990 |
prompts = [
|
|
|
992 |
"A letter arrives from the Emperor's office, requesting your personal insight on the matter. What counsel do you offer?",
|
993 |
"As you walk through the streets, people gather, eager to hear your thoughts on the Emperor's dilemma. What advice do you give?"
|
994 |
]
|
995 |
+
# Pass the current topic_id to share_wisdom if needed, though it's not currently used there.
|
996 |
share_wisdom(random.choice(prompts), allow_skip=True)
|
997 |
|
998 |
# Get next comment
|
999 |
+
# This should always get the next unvoted comment for the user in this topic.
|
1000 |
next_comment_id, next_comment_content = get_random_unvoted_comment(user_id, topic_id)
|
1001 |
st.session_state.current_comment_id = next_comment_id
|
1002 |
st.session_state.current_comment_content = next_comment_content
|
1003 |
|
1004 |
# Update progress
|
1005 |
+
# Update the user's progress to the next comment they should see.
|
1006 |
update_user_progress(user_id, topic_id, next_comment_id)
|
1007 |
|
1008 |
st.rerun() # Rerun to update UI
|
|
|
1053 |
st.session_state.current_comment_content = "Loading comments..."
|
1054 |
if 'comment_history' not in st.session_state:
|
1055 |
st.session_state.comment_history = ""
|
1056 |
+
if 'processed_url_params' not in st.session_state:
|
1057 |
+
st.session_state.processed_url_params = False # Add flag initialization
|
1058 |
|
1059 |
# Initialize the database on first run
|
1060 |
initialize_database()
|
1061 |
|
1062 |
# Handle initial load from URL query parameters
|
1063 |
+
# Process only once per session load using the flag
|
1064 |
query_params = st.query_params
|
1065 |
+
# Check for 'topic' param and if it hasn't been processed yet
|
1066 |
+
if 'topic' in query_params and not st.session_state.processed_url_params:
|
1067 |
+
topic_id_from_url = query_params.get('topic') # Use .get for safety
|
1068 |
+
if topic_id_from_url: # Check if topic_id is actually retrieved
|
1069 |
+
st.session_state.page = 'view_topic'
|
1070 |
+
st.session_state.current_topic_id = topic_id_from_url
|
1071 |
+
st.session_state.processed_url_params = True # Mark as processed
|
1072 |
+
# The view_topic_page will handle loading user/comment based on session_state.user_email
|
1073 |
+
st.rerun() # Rerun to apply the page change
|
1074 |
|
1075 |
|
1076 |
# Render the appropriate page based on session state
|