pentarosarium committed on
Commit
f06b820
·
1 Parent(s): ccf48e2

1.19 print debug

Browse files
Files changed (1) hide show
  1. app.py +19 -2
app.py CHANGED
@@ -322,7 +322,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
322
 
323
 
324
  def main():
325
- st.title("кластеризуем новости v.1.19")
326
  st.write("Upload Excel file with columns: company, datetime, text")
327
 
328
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -360,14 +360,19 @@ def main():
360
  # Step 1: Deduplicate
361
  deduplicator = NewsDeduplicator(fuzzy_threshold)
362
  dedup_df = deduplicator.deduplicate(df, progress_bar)
 
 
363
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
364
 
365
  # Preserve all columns from original DataFrame in dedup_df
366
  dedup_df_full = df_original.loc[dedup_df.index].copy()
 
367
 
368
  # Step 2: Cluster deduplicated news
369
  processor = NewsProcessor(similarity_threshold, time_threshold)
370
  result_df = processor.process_news(dedup_df, progress_bar)
 
 
371
 
372
  # Initialize set of indices to delete
373
  indices_to_delete = set()
@@ -376,27 +381,39 @@ def main():
376
  if len(result_df) > 0:
377
  # Get all multi-item clusters
378
  multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
 
379
 
380
  # For each multi-item cluster
381
  for cluster_id in multi_clusters:
 
382
  # Get rows in this cluster
383
  cluster_mask = result_df['cluster_id'] == cluster_id
384
  cluster_rows = result_df[cluster_mask]
 
385
 
386
  # Get their original indices from dedup_df_full
387
  original_indices = dedup_df_full.index[cluster_rows.index - 1]
 
388
 
389
  # Find the row with longest text among these indices
390
  text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
 
391
  longest_text_idx = text_lengths.idxmax()
 
392
 
393
  # Add all other indices to delete set
394
- indices_to_delete.update(set(original_indices) - {longest_text_idx})
 
 
 
 
395
 
396
  # Create final declustered DataFrame by removing identified rows
397
  declustered_df = dedup_df_full.copy()
398
  if indices_to_delete:
399
  declustered_df = declustered_df.drop(index=list(indices_to_delete))
 
 
400
 
401
  # Print statistics
402
  st.success(f"""
 
322
 
323
 
324
  def main():
325
+ st.title("кластеризуем новости v.1.19 print debug")
326
  st.write("Upload Excel file with columns: company, datetime, text")
327
 
328
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
 
360
  # Step 1: Deduplicate
361
  deduplicator = NewsDeduplicator(fuzzy_threshold)
362
  dedup_df = deduplicator.deduplicate(df, progress_bar)
363
+ print("\nAfter deduplication:")
364
+ print(f"dedup_df indices: {dedup_df.index.tolist()}")
365
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
366
 
367
  # Preserve all columns from original DataFrame in dedup_df
368
  dedup_df_full = df_original.loc[dedup_df.index].copy()
369
+ print(f"dedup_df_full indices: {dedup_df_full.index.tolist()}")
370
 
371
  # Step 2: Cluster deduplicated news
372
  processor = NewsProcessor(similarity_threshold, time_threshold)
373
  result_df = processor.process_news(dedup_df, progress_bar)
374
+ print("\nAfter clustering:")
375
+ print(f"result_df indices: {result_df.index.tolist()}")
376
 
377
  # Initialize set of indices to delete
378
  indices_to_delete = set()
 
381
  if len(result_df) > 0:
382
  # Get all multi-item clusters
383
  multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
384
+ print(f"\nMulti-clusters found: {multi_clusters.tolist()}")
385
 
386
  # For each multi-item cluster
387
  for cluster_id in multi_clusters:
388
+ print(f"\nProcessing cluster {cluster_id}:")
389
  # Get rows in this cluster
390
  cluster_mask = result_df['cluster_id'] == cluster_id
391
  cluster_rows = result_df[cluster_mask]
392
+ print(f"Cluster rows indices: {cluster_rows.index.tolist()}")
393
 
394
  # Get their original indices from dedup_df_full
395
  original_indices = dedup_df_full.index[cluster_rows.index - 1]
396
+ print(f"Original indices: {original_indices.tolist()}")
397
 
398
  # Find the row with longest text among these indices
399
  text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
400
+ print(f"Text lengths: {text_lengths.to_dict()}")
401
  longest_text_idx = text_lengths.idxmax()
402
+ print(f"Longest text index: {longest_text_idx}")
403
 
404
  # Add all other indices to delete set
405
+ new_indices_to_delete = set(original_indices) - {longest_text_idx}
406
+ indices_to_delete.update(new_indices_to_delete)
407
+ print(f"Indices to delete from this cluster: {new_indices_to_delete}")
408
+
409
+ print(f"\nFinal indices to delete: {sorted(list(indices_to_delete))}")
410
 
411
  # Create final declustered DataFrame by removing identified rows
412
  declustered_df = dedup_df_full.copy()
413
  if indices_to_delete:
414
  declustered_df = declustered_df.drop(index=list(indices_to_delete))
415
+ print(f"\nFinal kept indices: {sorted(declustered_df.index.tolist())}")
416
+
417
 
418
  # Print statistics
419
  st.success(f"""