pentarosarium committed
Commit ac7c699 · 1 Parent(s): f06b820

1.20 print debug

Files changed (1)
app.py +15 -15
app.py CHANGED
@@ -322,7 +322,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
 
 
 def main():
-    st.title("кластеризуем новости v.1.19 print debug")
+    st.title("кластеризуем новости v.1.20 print debug")
     st.write("Upload Excel file with columns: company, datetime, text")
 
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -360,19 +360,19 @@ def main():
         # Step 1: Deduplicate
         deduplicator = NewsDeduplicator(fuzzy_threshold)
         dedup_df = deduplicator.deduplicate(df, progress_bar)
-        print("\nAfter deduplication:")
-        print(f"dedup_df indices: {dedup_df.index.tolist()}")
+        st.write("\nAfter deduplication:")
+        st.write(f"dedup_df indices: {dedup_df.index.tolist()}")
         st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
 
         # Preserve all columns from original DataFrame in dedup_df
         dedup_df_full = df_original.loc[dedup_df.index].copy()
-        print(f"dedup_df_full indices: {dedup_df_full.index.tolist()}")
+        st.write(f"dedup_df_full indices: {dedup_df_full.index.tolist()}")
 
         # Step 2: Cluster deduplicated news
         processor = NewsProcessor(similarity_threshold, time_threshold)
         result_df = processor.process_news(dedup_df, progress_bar)
-        print("\nAfter clustering:")
-        print(f"result_df indices: {result_df.index.tolist()}")
+        st.write("\nAfter clustering:")
+        st.write(f"result_df indices: {result_df.index.tolist()}")
 
         # Initialize set of indices to delete
         indices_to_delete = set()
@@ -381,38 +381,38 @@ def main():
         if len(result_df) > 0:
             # Get all multi-item clusters
             multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
-            print(f"\nMulti-clusters found: {multi_clusters.tolist()}")
+            st.write(f"\nMulti-clusters found: {multi_clusters.tolist()}")
 
             # For each multi-item cluster
             for cluster_id in multi_clusters:
-                print(f"\nProcessing cluster {cluster_id}:")
+                st.write(f"\nProcessing cluster {cluster_id}:")
                 # Get rows in this cluster
                 cluster_mask = result_df['cluster_id'] == cluster_id
                 cluster_rows = result_df[cluster_mask]
-                print(f"Cluster rows indices: {cluster_rows.index.tolist()}")
+                st.write(f"Cluster rows indices: {cluster_rows.index.tolist()}")
 
                 # Get their original indices from dedup_df_full
                 original_indices = dedup_df_full.index[cluster_rows.index - 1]
-                print(f"Original indices: {original_indices.tolist()}")
+                st.write(f"Original indices: {original_indices.tolist()}")
 
                 # Find the row with longest text among these indices
                 text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
-                print(f"Text lengths: {text_lengths.to_dict()}")
+                st.write(f"Text lengths: {text_lengths.to_dict()}")
                 longest_text_idx = text_lengths.idxmax()
-                print(f"Longest text index: {longest_text_idx}")
+                st.write(f"Longest text index: {longest_text_idx}")
 
                 # Add all other indices to delete set
                 new_indices_to_delete = set(original_indices) - {longest_text_idx}
                 indices_to_delete.update(new_indices_to_delete)
-                print(f"Indices to delete from this cluster: {new_indices_to_delete}")
+                st.write(f"Indices to delete from this cluster: {new_indices_to_delete}")
 
-            print(f"\nFinal indices to delete: {sorted(list(indices_to_delete))}")
+            st.write(f"\nFinal indices to delete: {sorted(list(indices_to_delete))}")
 
             # Create final declustered DataFrame by removing identified rows
             declustered_df = dedup_df_full.copy()
             if indices_to_delete:
                 declustered_df = declustered_df.drop(index=list(indices_to_delete))
-            print(f"\nFinal kept indices: {sorted(declustered_df.index.tolist())}")
+            st.write(f"\nFinal kept indices: {sorted(declustered_df.index.tolist())}")
 
 
         # Print statistics
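
The point of the commit is visibility: `print` output from a deployed Streamlit app only reaches the server console, while `st.write` renders in the page itself. A minimal sketch of the same idea, using a hypothetical `debug_write` helper and sidebar toggle that are not part of this commit:

# Sketch (not from the commit): route debug output through st.write so it
# shows up in the Streamlit page; print() would only reach the server log.
import streamlit as st

DEBUG = st.sidebar.checkbox("Show debug output", value=True)

def debug_write(*args):
    # Renders in the app page instead of the server console.
    if DEBUG:
        st.write(*args)

debug_write("After deduplication:")
debug_write(f"dedup_df indices: {[0, 1, 2]}")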
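The subtlest line the debug output is tracing is the index round-trip: `cluster_rows.index - 1` treats `result_df`'s index as 1-based positions into `dedup_df_full`. A toy reconstruction of that mapping, assuming (as the `- 1` implies, though the source does not show `process_news`) that clustering returns rows with a 1-based RangeIndex:

# Toy reconstruction of the index mapping; the 1-based RangeIndex is an
# assumption inferred from the `- 1` in the diff.
import pandas as pd

dedup_df_full = pd.DataFrame({"text": ["a", "b", "c"]}, index=[10, 20, 30])
result_df = dedup_df_full.reset_index(drop=True)
result_df.index = result_df.index + 1          # 1-based positions (assumed)

cluster_rows = result_df.loc[[1, 3]]           # two rows of one cluster
original_indices = dedup_df_full.index[cluster_rows.index - 1]
print(original_indices.tolist())               # [10, 30]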
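The declustering rule itself (within each multi-item cluster, keep only the row with the longest text) can also be expressed compactly with groupby/idxmax. A toy sketch with made-up data, illustrating the rule rather than replacing the loop above:

# Toy sketch of the declustering rule: keep the longest text per cluster
# (ties go to the first row, matching idxmax in the loop above).
import pandas as pd

df = pd.DataFrame({
    "cluster_id": [1, 1, 2, 3, 3],
    "text": ["short", "a much longer text", "solo", "x", "xxx"],
})

keep = df["text"].fillna("").str.len().groupby(df["cluster_id"]).idxmax()
declustered = df.loc[sorted(keep)]
print(declustered)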