pentarosarium committed on
Commit
e032dc3
·
1 Parent(s): 30c87ad
Files changed (1) hide show
  1. app.py +29 -40
app.py CHANGED
@@ -322,7 +322,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
322
 
323
 
324
  def main():
325
- st.title("кластеризуем новости v.1.15")
326
  st.write("Upload Excel file with columns: company, datetime, text")
327
 
328
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -362,69 +362,58 @@ def main():
362
  dedup_df = deduplicator.deduplicate(df, progress_bar)
363
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
364
 
365
- # Get working copy of deduplicated data with all columns
366
  dedup_df_full = df_original.loc[dedup_df.index].copy()
367
 
368
- # Create working copy for clustering with required columns
369
- working_df = dedup_df_full[[company_column, datetime_column, title_column, text_column]].copy()
370
- working_df.columns = ['company', 'datetime', 'title', 'text']
371
-
372
  # Step 2: Cluster deduplicated news
373
  processor = NewsProcessor(similarity_threshold, time_threshold)
374
- result_df = processor.process_news(working_df, progress_bar)
375
 
376
- # Create a dictionary to store indices to delete for each cluster
377
  indices_to_delete = set()
378
 
379
- # Process clusters with more than 1 member
380
- if not result_df.empty:
381
- for cluster_id in result_df['cluster_id'].unique():
382
- cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
 
 
 
 
 
 
 
 
383
 
384
- if len(cluster_rows) > 1: # Only process multi-member clusters
385
- # Get indices of all rows in this cluster
386
- cluster_indices = cluster_rows.index
387
-
388
- # Find the row with the longest text
389
- text_lengths = working_df.loc[cluster_indices, 'text'].str.len()
390
- longest_text_idx = text_lengths.idxmax()
391
-
392
- # Add all other indices from this cluster to deletion set
393
- cluster_indices_to_delete = set(cluster_indices) - {longest_text_idx}
394
- indices_to_delete.update(cluster_indices_to_delete)
395
 
396
- # Create final declustered DataFrame by dropping the identified rows
397
  declustered_df = dedup_df_full.copy()
398
  if indices_to_delete:
399
  declustered_df = declustered_df.drop(index=list(indices_to_delete))
400
 
401
- # Print statistics for verification
402
  st.success(f"""
403
  Processing results:
404
  - Original rows: {len(df_original)}
405
  - After deduplication: {len(dedup_df_full)}
406
- - Rows in clusters (any size): {len(result_df) if not result_df.empty else 0}
407
- - Rows removed from multi-member clusters: {len(indices_to_delete)}
408
- - Final rows after declustering: {len(declustered_df)}
409
  """)
410
 
411
- # Add debugging information
412
- if not result_df.empty:
413
- multi_clusters = len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())
414
- st.write(f"Number of multi-member clusters found: {multi_clusters}")
415
-
416
- # Show cluster sizes
417
- cluster_sizes = result_df['cluster_size'].value_counts().sort_index()
418
- st.write("Cluster size distribution:")
419
- st.write(cluster_sizes)
420
-
421
  # Download buttons for all results
422
  st.subheader("Download Results")
423
  st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
 
424
  st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
425
 
426
- # Show cluster statistics if clusters were found
427
- if not result_df.empty:
428
  st.subheader("Largest Clusters")
429
  largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
430
  ['cluster_size', 'cluster_id', 'datetime'],
 
322
 
323
 
324
  def main():
325
+ st.title("кластеризуем новости v.1.16")
326
  st.write("Upload Excel file with columns: company, datetime, text")
327
 
328
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
 
362
  dedup_df = deduplicator.deduplicate(df, progress_bar)
363
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
364
 
365
+ # Preserve all columns from original DataFrame in dedup_df
366
  dedup_df_full = df_original.loc[dedup_df.index].copy()
367
 
 
 
 
 
368
  # Step 2: Cluster deduplicated news
369
  processor = NewsProcessor(similarity_threshold, time_threshold)
370
+ result_df = processor.process_news(dedup_df, progress_bar)
371
 
372
+ # Initialize set of indices to delete
373
  indices_to_delete = set()
374
 
375
+ # Find rows to delete from multi-item clusters
376
+ if len(result_df) > 0:
377
+ # Get all multi-item clusters
378
+ multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
379
+
380
+ # For each multi-item cluster
381
+ for cluster_id in multi_clusters:
382
+ # Get indices of all rows in this cluster
383
+ cluster_indices = result_df[result_df['cluster_id'] == cluster_id].index.tolist()
384
+
385
+ # Get their text lengths
386
+ text_lengths = dedup_df_full.loc[cluster_indices, text_column].fillna('').str.len()
387
 
388
+ # Find index with longest text
389
+ longest_text_idx = text_lengths.idxmax()
390
+
391
+ # Add all other indices to delete set
392
+ indices_to_delete.update(set(cluster_indices) - {longest_text_idx})
 
 
 
 
 
 
393
 
394
+ # Create final declustered DataFrame by removing identified rows
395
  declustered_df = dedup_df_full.copy()
396
  if indices_to_delete:
397
  declustered_df = declustered_df.drop(index=list(indices_to_delete))
398
 
399
+ # Print statistics
400
  st.success(f"""
401
  Processing results:
402
  - Original rows: {len(df_original)}
403
  - After deduplication: {len(dedup_df_full)}
404
+ - Multi-item clusters found: {len(multi_clusters) if len(result_df) > 0 else 0}
405
+ - Rows removed from clusters: {len(indices_to_delete)}
406
+ - Final rows kept: {len(declustered_df)}
407
  """)
408
 
 
 
 
 
 
 
 
 
 
 
409
  # Download buttons for all results
410
  st.subheader("Download Results")
411
  st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
412
+ st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
413
  st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
414
 
415
+ # Show clusters info
416
+ if len(result_df) > 0:
417
  st.subheader("Largest Clusters")
418
  largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
419
  ['cluster_size', 'cluster_id', 'datetime'],