pentarosarium committed
Commit 6f609e4 · 1 Parent(s): 21d6a34
Files changed (1): app.py +17 -15
app.py CHANGED
@@ -321,7 +321,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
     return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
 
 def main():
-    st.title("кластеризуем новости v.1.11")
+    st.title("кластеризуем новости v.1.12")
     st.write("Upload Excel file with columns: company, datetime, text")
 
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -356,20 +356,21 @@ def main():
     try:
         progress_bar = st.progress(0)
 
+        # Step 1: Deduplicate
         deduplicator = NewsDeduplicator(fuzzy_threshold)
         dedup_df = deduplicator.deduplicate(df, progress_bar)
         st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
 
-        st.write("Sample of deduplicated data:")
-        st.dataframe(dedup_df[['datetime', 'company', 'text', 'company_count', 'duplicate_count']].head())
+        # Preserve all columns from original DataFrame in dedup_df
+        dedup_df = df_original.loc[dedup_df.index].copy()
 
-        processor = NewsProcessor(similarity_threshold, time_threshold)
-        result_df = processor.process_news(dedup_df, progress_bar)
+        # Create working copy of dedup_df with required columns for clustering
+        working_df = dedup_df[[company_column, datetime_column, title_column, text_column]].copy()
+        working_df.columns = ['company', 'datetime', 'title', 'text']
 
-        # Create a mapping between original indices and cluster information
-        index_to_cluster = pd.Series(0, index=df_original.index)  # Default cluster 0 for non-clustered rows
-        for idx, row in result_df.iterrows():
-            index_to_cluster[idx] = row['cluster_id']
+        # Step 2: Cluster deduplicated news
+        processor = NewsProcessor(similarity_threshold, time_threshold)
+        result_df = processor.process_news(working_df, progress_bar)
 
         # Initialize list of indices to keep
         indices_to_keep = []
@@ -382,26 +383,27 @@
                 # For clusters with multiple items, keep only the one with longest text
                 cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
                 cluster_indices = cluster_rows.index
-                text_lengths = df_original.iloc[cluster_indices][text_column].str.len()
+                text_lengths = dedup_df.iloc[cluster_indices][text_column].str.len()
                 longest_text_idx = cluster_indices[text_lengths.argmax()]
                 indices_to_keep.append(longest_text_idx)
             else:
                 # For single-item clusters, keep the item
                 indices_to_keep.extend(result_df[result_df['cluster_id'] == cluster_id].index)
 
-        # Add all non-clustered rows (cluster_id = 0)
-        non_clustered_indices = df_original.index[~df_original.index.isin(result_df.index)]
+        # Add all non-clustered rows from dedup_df
+        non_clustered_indices = dedup_df.index[~dedup_df.index.isin(result_df.index)]
        indices_to_keep.extend(non_clustered_indices)
 
-        # Create final declustered DataFrame
-        declustered_df = df_original.iloc[sorted(indices_to_keep)].copy()
+        # Create final declustered DataFrame from dedup_df
+        declustered_df = dedup_df.iloc[sorted(indices_to_keep)].copy()
 
         st.success(f"""
         Processing results:
         - Original rows: {len(df_original)}
+        - After deduplication: {len(dedup_df)}
         - Rows in clusters: {len(result_df)}
         - Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
-        - Rows kept after declustering: {len(declustered_df)}
+        - Final rows after declustering: {len(declustered_df)}
         """)
 
         # Download buttons for all results
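For reference, a minimal runnable sketch of the decluster step this commit reworks, in plain pandas. NewsDeduplicator and NewsProcessor are app-specific classes, so the deduplicated frame and cluster assignments below are hard-coded stand-ins, and the sketch uses label-based .loc lookups with idxmax rather than the app's positional .iloc/argmax; it illustrates the keep-longest-text-per-cluster rule, not the commit's exact code.

import pandas as pd

# Toy stand-in for dedup_df after the commit's re-indexing step.
dedup_df = pd.DataFrame({
    'company': ['A', 'A', 'B', 'B', 'C'],
    'datetime': pd.to_datetime(['2024-01-01'] * 5),
    'text': ['short', 'a much longer story', 'news', 'other news item', 'solo'],
})

# Toy stand-in for NewsProcessor output: rows 0-1 form cluster 1,
# rows 2-3 form cluster 2, row 4 never entered a cluster.
result_df = pd.DataFrame({'cluster_id': [1, 1, 2, 2]}, index=[0, 1, 2, 3])

indices_to_keep = []
for cluster_id, cluster_rows in result_df.groupby('cluster_id'):
    cluster_indices = cluster_rows.index
    if len(cluster_indices) > 1:
        # Multi-item cluster: keep only the row with the longest text.
        text_lengths = dedup_df.loc[cluster_indices, 'text'].str.len()
        indices_to_keep.append(text_lengths.idxmax())
    else:
        # Single-item cluster: keep its one row.
        indices_to_keep.extend(cluster_indices)

# Non-clustered rows pass through unchanged.
non_clustered = dedup_df.index[~dedup_df.index.isin(result_df.index)]
indices_to_keep.extend(non_clustered)

declustered_df = dedup_df.loc[sorted(indices_to_keep)].copy()
print(declustered_df)  # keeps rows 1, 3 and 4

In app.py the same selection runs inside main(), with a Streamlit progress bar around the dedup and clustering stages and the counts reported via st.success.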