pentarosarium committed on
Commit
25558c7
·
1 Parent(s): 67890fd

1.23 print debug

Browse files
Files changed (1) hide show
  1. app.py +7 -4
app.py CHANGED
@@ -301,7 +301,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
301
 
302
 
303
  def main():
304
- st.title("кластеризуем новости v.1.22 + print debug")
305
  st.write("Upload Excel file with columns: company, datetime, text")
306
 
307
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -339,6 +339,10 @@ def main():
339
  # Step 1: Deduplicate
340
  deduplicator = NewsDeduplicator(fuzzy_threshold)
341
  dedup_df = deduplicator.deduplicate(df, progress_bar)
 
 
 
 
342
  st.write("\nDeduplication Results:")
343
  st.write(f"Original indices: {df.index.tolist()}")
344
  st.write(f"Dedup indices: {dedup_df.index.tolist()}")
@@ -382,19 +386,18 @@ def main():
382
  st.write(f"Indices to delete: {sorted(list(indices_to_delete))}")
383
 
384
  # Create final DataFrame
385
- declustered_df = dedup_df.copy()
386
  if indices_to_delete:
387
  declustered_df = declustered_df.drop(index=list(indices_to_delete))
388
 
389
  st.write(f"Final indices kept: {sorted(declustered_df.index.tolist())}")
390
-
391
 
392
  # Print statistics
393
  st.success(f"""
394
  Processing results:
395
  - Original rows: {len(df_original)}
396
  - After deduplication: {len(dedup_df_full)}
397
- - Multi-item clusters found: {len(multi_clusters) if len(result_df) > 0 else 0}
398
  - Rows removed from clusters: {len(indices_to_delete)}
399
  - Final rows kept: {len(declustered_df)}
400
  """)
 
301
 
302
 
303
  def main():
304
+ st.title("кластеризуем новости v.1.23 + print debug")
305
  st.write("Upload Excel file with columns: company, datetime, text")
306
 
307
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
 
339
  # Step 1: Deduplicate
340
  deduplicator = NewsDeduplicator(fuzzy_threshold)
341
  dedup_df = deduplicator.deduplicate(df, progress_bar)
342
+
343
+ # Preserve all columns from original DataFrame in dedup_df_full
344
+ dedup_df_full = df_original.loc[dedup_df.index].copy()
345
+
346
  st.write("\nDeduplication Results:")
347
  st.write(f"Original indices: {df.index.tolist()}")
348
  st.write(f"Dedup indices: {dedup_df.index.tolist()}")
 
386
  st.write(f"Indices to delete: {sorted(list(indices_to_delete))}")
387
 
388
  # Create final DataFrame
389
+ declustered_df = dedup_df_full.copy()
390
  if indices_to_delete:
391
  declustered_df = declustered_df.drop(index=list(indices_to_delete))
392
 
393
  st.write(f"Final indices kept: {sorted(declustered_df.index.tolist())}")
 
394
 
395
  # Print statistics
396
  st.success(f"""
397
  Processing results:
398
  - Original rows: {len(df_original)}
399
  - After deduplication: {len(dedup_df_full)}
400
+ - Multi-item clusters found: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()) if len(result_df) > 0 else 0}
401
  - Rows removed from clusters: {len(indices_to_delete)}
402
  - Final rows kept: {len(declustered_df)}
403
  """)