pentarosarium committed
Commit ccf48e2 · Parent: 75f4618
Files changed (1)
  1. app.py +24 -30
app.py CHANGED
@@ -322,15 +322,15 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
 
 
 def main():
-    st.title("кластеризуем новости v.1.17")
+    st.title("кластеризуем новости v.1.19")
     st.write("Upload Excel file with columns: company, datetime, text")
 
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
 
     if uploaded_file:
         try:
-            # Read all columns from original sheet and reset index
-            df_original = pd.read_excel(uploaded_file, sheet_name='Публикации').reset_index(drop=True)
+            # Read all columns from original sheet
+            df_original = pd.read_excel(uploaded_file, sheet_name='Публикации')
             st.write("Available columns:", df_original.columns.tolist())
 
             # Create working copy with required columns
@@ -360,56 +360,50 @@ def main():
             # Step 1: Deduplicate
             deduplicator = NewsDeduplicator(fuzzy_threshold)
             dedup_df = deduplicator.deduplicate(df, progress_bar)
-            dedup_df = dedup_df.reset_index(drop=True)  # Reset index after deduplication
             st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
 
-            # Preserve all columns from original DataFrame in dedup_df and reset index
-            dedup_df_full = df_original.loc[dedup_df.index].copy().reset_index(drop=True)
-            dedup_df.index = dedup_df_full.index  # Ensure indices match
+            # Preserve all columns from original DataFrame in dedup_df
+            dedup_df_full = df_original.loc[dedup_df.index].copy()
 
             # Step 2: Cluster deduplicated news
             processor = NewsProcessor(similarity_threshold, time_threshold)
             result_df = processor.process_news(dedup_df, progress_bar)
 
+            # Initialize set of indices to delete
+            indices_to_delete = set()
+
+            # Find rows to delete from multi-item clusters
             if len(result_df) > 0:
-                # Ensure result_df index matches dedup_df_full
-                result_df.index = dedup_df_full.index[result_df.index]
-
-                # Initialize set of indices to delete
-                indices_to_delete = set()
-
-                # Find rows to delete from multi-item clusters
+                # Get all multi-item clusters
                 multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
 
+                # For each multi-item cluster
                 for cluster_id in multi_clusters:
-                    # Get indices of all rows in this cluster
+                    # Get rows in this cluster
                     cluster_mask = result_df['cluster_id'] == cluster_id
-                    cluster_indices = result_df[cluster_mask].index.tolist()
+                    cluster_rows = result_df[cluster_mask]
 
-                    # Get their text lengths
-                    text_lengths = dedup_df_full.loc[cluster_indices, text_column].fillna('').str.len()
+                    # Get their original indices from dedup_df_full
+                    original_indices = dedup_df_full.index[cluster_rows.index - 1]
 
-                    # Find index with longest text
+                    # Find the row with longest text among these indices
+                    text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
                     longest_text_idx = text_lengths.idxmax()
 
                     # Add all other indices to delete set
-                    indices_to_delete.update(set(cluster_indices) - {longest_text_idx})
-
-                # Create final declustered DataFrame by removing identified rows
-                declustered_df = dedup_df_full.copy()
-                if indices_to_delete:
-                    declustered_df = declustered_df.drop(index=list(indices_to_delete))
-            else:
-                declustered_df = dedup_df_full.copy()
-                indices_to_delete = set()
-                multi_clusters = []
+                    indices_to_delete.update(set(original_indices) - {longest_text_idx})
+
+            # Create final declustered DataFrame by removing identified rows
+            declustered_df = dedup_df_full.copy()
+            if indices_to_delete:
+                declustered_df = declustered_df.drop(index=list(indices_to_delete))
 
             # Print statistics
             st.success(f"""
             Processing results:
             - Original rows: {len(df_original)}
             - After deduplication: {len(dedup_df_full)}
-            - Multi-item clusters found: {len(multi_clusters)}
+            - Multi-item clusters found: {len(multi_clusters) if len(result_df) > 0 else 0}
             - Rows removed from clusters: {len(indices_to_delete)}
             - Final rows kept: {len(declustered_df)}
             """)