npc0 committed on
Commit
2828463
·
verified ·
1 Parent(s): c005b55
Files changed (1) hide show
  1. src/streamlit_app.py +52 -31
src/streamlit_app.py CHANGED
@@ -320,7 +320,8 @@ def get_ttl_hash(seconds=360):
320
  def get_r_matrix_from_votes():
321
  local_con = None
322
  try:
323
- local_con = duckdb.connect(database=DB_PATH, read_only=True) # Read-only is sufficient
 
324
 
325
  # Fetch all vote data
326
  # fetchdf requires pandas
@@ -504,58 +505,71 @@ def get_user_cluster_label(user_id, ttl_hash=None):
504
 
505
 
506
  # Helper function to get top k most polarized comments for a list of users
507
- def get_top_k_polarized_comments_for_users(user_ids, k=5):
508
  """
509
- Retrieves the top k comments most agreed or disagreed upon (most polarized)
510
- by a given list of users.
 
 
 
 
511
 
512
  Args:
513
  user_ids (list[str]): A list of user IDs.
 
514
  k (int): The number of top comments to retrieve.
515
 
516
  Returns:
517
  list[tuple]: A list of tuples, where each tuple contains
518
- (comment_id, comment_content, average_vote_score),
519
- ordered by the absolute value of the average score descending.
520
  Returns an empty list if no votes are found for these users
521
- or on error.
522
  """
523
- if not user_ids:
524
- # print("Warning: get_top_k_polarized_comments_for_users called with empty user_ids list.") # Optional debug
525
- return [] # Cannot query without user IDs
 
526
 
527
  local_con = None
528
  try:
529
  local_con = duckdb.connect(database=DB_PATH, read_only=True)
530
 
531
- # Use parameterized query for the list of user IDs
532
  # DuckDB's Python API handles lists for IN clauses
533
  query = """
534
  SELECT
535
  v.comment_id,
536
  c.content,
537
- AVG(CASE
538
  WHEN v.vote_type = 'agree' THEN 1.0
539
  WHEN v.vote_type = 'neutral' THEN 0.0
540
  WHEN v.vote_type = 'disagree' THEN -1.0
541
  ELSE NULL -- Should not happen with current data
542
- END) as average_vote_score
 
543
  FROM votes v
544
  JOIN comments c ON v.comment_id = c.id
545
- WHERE v.user_id IN (?)
546
  GROUP BY v.comment_id, c.content
547
- HAVING COUNT(v.user_id) > 0 -- Ensure at least one user from the list voted on this comment
548
- ORDER BY ABS(average_vote_score) DESC
549
  LIMIT ?
550
  """
551
- # Pass the list of user_ids and k as parameters
552
- result = local_con.execute(query, [user_ids, k]).fetchall()
 
553
 
554
- return result
 
 
 
 
 
555
 
556
  except Exception as e:
557
  # st.error is not available here, just print or log
558
- print(f"Error getting top k polarized comments for users {user_ids}: {e}")
559
  return [] # Return empty list on error
560
  finally:
561
  if local_con:
@@ -591,17 +605,20 @@ def estimate_group_voting_diversity(user_ids, topic_id):
591
 
592
  # Get all votes for the given topic by the specified users
593
  # Join with comments to filter by topic_id
594
- query = """
 
 
595
  SELECT
596
  v.comment_id,
597
  v.user_id,
598
  v.vote_type
599
  FROM votes v
600
  JOIN comments c ON v.comment_id = c.id
601
- WHERE c.topic_id = ? AND v.user_id IN (?)
602
  """
603
- # DuckDB's Python API handles lists/tuples for IN clauses
604
- results = local_con.execute(query, [topic_id, user_ids_tuple]).fetchall()
 
605
 
606
  if not results:
607
  return 0.0 # No votes found for this group on this topic
@@ -643,7 +660,6 @@ def estimate_group_voting_diversity(user_ids, topic_id):
643
  if local_con:
644
  local_con.close()
645
 
646
-
647
  # Helper function to name a group of users based on their participation and voting diversity
648
  def name_user_group(user_ids, topic_id):
649
  """
@@ -805,8 +821,10 @@ def get_random_unvoted_comment(user_id, topic_id):
805
  if current_label is not None and previous_label is not None and current_label != previous_label:
806
  if current_users_set != previous_users_set:
807
  # Set a flag in session state to display the message later in the main rendering logic
 
 
808
  st.session_state._show_new_area_message = True
809
- new_area_comments = get_top_k_polarized_comments_for_users(current_users_set, k=5)
810
  st.session_state._new_area_comments = new_area_comments
811
  # print(f"DEBUG: Cluster changed for user {user_id} in topic {topic_id}: {previous_label} -> {current_label}")
812
  # print(f"DEBUG: Previous users count: {len(previous_users_set)}, Current users count: {len(current_users_set)}")
@@ -1157,7 +1175,7 @@ def view_topic_page():
1157
  st.markdown(random.choice(prompts))
1158
  new_comment_text = st.text_area("Your Insight that different from others above (Empty to skip)", key="tmp_new_comment_input")
1159
  st.session_state.handling_vote = True # lock
1160
- if st.button("Share Your Wisdom"):
1161
  if new_comment_text and len(new_comment_text.strip()):
1162
  user_email = st.session_state.get('user_email', '')
1163
  user_id = find_or_create_user(user_email) # Ensure user exists
@@ -1296,11 +1314,14 @@ if 'comment_history' not in st.session_state:
1296
  if 'processed_url_params' not in st.session_state:
1297
  st.session_state.processed_url_params = False # Add flag initialization
1298
 
1299
- # Initialize the database on first run
1300
- initialize_database()
1301
- if st.session_state.get('_add_dummy', True):
 
1302
  add_dummy_topic()
1303
- st.session_state._add_dummy = False
 
 
1304
 
1305
  # Handle initial load from URL query parameters
1306
  # Process only once per session load using the flag
 
320
  def get_r_matrix_from_votes():
321
  local_con = None
322
  try:
323
+ # Use read_only=False to maintain consistent configuration across all connections
324
+ local_con = duckdb.connect(database=DB_PATH, read_only=False)
325
 
326
  # Fetch all vote data
327
  # fetchdf requires pandas
 
505
 
506
 
507
  # Helper function to get the top k highest-consensus (lowest vote variance) comments for a list of users on a topic
508
+ def get_top_k_consensus_comments_for_users(user_ids, topic_id, k=5):
509
  """
510
+ Retrieves the top k comments with the highest voting consensus (lowest variance)
511
+ among a given list of users *for a specific topic*.
512
+
513
+ Consensus is measured by the population variance (VAR_POP) of numerical
514
+ vote scores (-1 for 'disagree', 0 for 'neutral', 1 for 'agree').
515
+ Lower variance indicates higher consensus.
516
 
517
  Args:
518
  user_ids (list[str]): A list of user IDs.
519
+ topic_id (str): The ID of the topic to filter comments by.
520
  k (int): The number of top comments to retrieve.
521
 
522
  Returns:
523
  list[tuple]: A list of tuples, where each tuple contains
524
+ (comment_id, comment_content, vote_variance),
525
+ ordered by vote_variance ascending (lowest variance first).
526
  Returns an empty list if no votes are found for these users
527
+ on this topic, or on error, or if the group has fewer than 2 users.
528
  """
529
+ if not user_ids or len(user_ids) < 2:
530
+ # Need at least 2 users from the group to calculate meaningful variance
531
+ # print("Warning: get_top_k_consensus_comments_for_users called with fewer than 2 user_ids.") # Optional debug
532
+ return [] # Cannot query without user IDs or with only one user
533
 
534
  local_con = None
535
  try:
536
  local_con = duckdb.connect(database=DB_PATH, read_only=True)
537
 
538
+ # Use parameterized query for the list of user IDs and topic ID
539
  # DuckDB's Python API handles lists for IN clauses
540
  query = """
541
  SELECT
542
  v.comment_id,
543
  c.content,
544
+ VAR_POP(CASE
545
  WHEN v.vote_type = 'agree' THEN 1.0
546
  WHEN v.vote_type = 'neutral' THEN 0.0
547
  WHEN v.vote_type = 'disagree' THEN -1.0
548
  ELSE NULL -- Should not happen with current data
549
+ END) as vote_variance,
550
+ COUNT(v.user_id) as num_votes_in_group -- Include count for potential tie-breaking
551
  FROM votes v
552
  JOIN comments c ON v.comment_id = c.id
553
+ WHERE v.user_id IN (?) AND c.topic_id = ? -- Filter by user IDs and topic ID
554
  GROUP BY v.comment_id, c.content
555
+ HAVING COUNT(v.user_id) >= 2 -- Ensure at least 2 users from the list voted on this comment
556
+ ORDER BY vote_variance ASC, num_votes_in_group DESC -- Order by lowest variance, then by number of votes (more votes = stronger consensus)
557
  LIMIT ?
558
  """
559
+ # Pass the list of user_ids, topic_id, and k as parameters
560
+ # DuckDB requires list parameters to be wrapped in a list/tuple for the execute method
561
+ result = local_con.execute(query, [user_ids, topic_id, k]).fetchall()
562
 
563
+ # The result includes comment_id, content, variance, and count.
564
+ # We only need comment_id, content, and variance for the return value as per docstring.
565
+ # The count was used for ordering.
566
+ formatted_result = [(row[0], row[1], row[2]) for row in result]
567
+
568
+ return formatted_result
569
 
570
  except Exception as e:
571
  # st.error is not available here, just print or log
572
+ print(f"Error getting top k consensus comments for users {user_ids} in topic {topic_id}: {e}")
573
  return [] # Return empty list on error
574
  finally:
575
  if local_con:
 
605
 
606
  # Get all votes for the given topic by the specified users
607
  # Join with comments to filter by topic_id
608
+ # Construct the IN clause dynamically to avoid the conversion error
609
+ placeholders = ', '.join(['?'] * len(user_ids_tuple))
610
+ query = f"""
611
  SELECT
612
  v.comment_id,
613
  v.user_id,
614
  v.vote_type
615
  FROM votes v
616
  JOIN comments c ON v.comment_id = c.id
617
+ WHERE c.topic_id = ? AND v.user_id IN ({placeholders})
618
  """
619
+ # Pass topic_id and then all user_ids as separate parameters
620
+ params = [topic_id] + list(user_ids_tuple) # Combine topic_id and user_ids
621
+ results = local_con.execute(query, params).fetchall()
622
 
623
  if not results:
624
  return 0.0 # No votes found for this group on this topic
 
660
  if local_con:
661
  local_con.close()
662
 
 
663
  # Helper function to name a group of users based on their participation and voting diversity
664
  def name_user_group(user_ids, topic_id):
665
  """
 
821
  if current_label is not None and previous_label is not None and current_label != previous_label:
822
  if current_users_set != previous_users_set:
823
  # Set a flag in session state to display the message later in the main rendering logic
824
+ print("st.session_state._show_new_area_message = True")
825
+ print("st.session_state._show_new_area_message = True")
826
  st.session_state._show_new_area_message = True
827
+ new_area_comments = get_top_k_consensus_comments_for_users(current_users_set, topic_id, k=5)
828
  st.session_state._new_area_comments = new_area_comments
829
  # print(f"DEBUG: Cluster changed for user {user_id} in topic {topic_id}: {previous_label} -> {current_label}")
830
  # print(f"DEBUG: Previous users count: {len(previous_users_set)}, Current users count: {len(current_users_set)}")
 
1175
  st.markdown(random.choice(prompts))
1176
  new_comment_text = st.text_area("Your Insight that different from others above (Empty to skip)", key="tmp_new_comment_input")
1177
  st.session_state.handling_vote = True # lock
1178
+ if st.button("Share Wisdom"):
1179
  if new_comment_text and len(new_comment_text.strip()):
1180
  user_email = st.session_state.get('user_email', '')
1181
  user_id = find_or_create_user(user_email) # Ensure user exists
 
1314
  if 'processed_url_params' not in st.session_state:
1315
  st.session_state.processed_url_params = False # Add flag initialization
1316
 
1317
+ # Initialize the database and add dummy data only once per session
1318
+ if st.session_state.get("db_initialized", False) is False:
1319
+ print("INFO: Initializing database and adding dummy data...") # Optional: Info message
1320
+ initialize_database()
1321
  add_dummy_topic()
1322
+ st.session_state.db_initialized = True
1323
+ print("INFO: Database initialization complete.") # Optional: Info message
1324
+
1325
 
1326
  # Handle initial load from URL query parameters
1327
  # Process only once per session load using the flag