pentarosarium committed on
Commit
8b2771c
·
1 Parent(s): 6f609e4
Files changed (1) hide show
  1. app.py +18 -21
app.py CHANGED
@@ -321,7 +321,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
321
  return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
322
 
323
  def main():
324
- st.title("кластеризуем новости v.1.12")
325
  st.write("Upload Excel file with columns: company, datetime, text")
326
 
327
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -362,45 +362,42 @@ def main():
362
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
363
 
364
  # Preserve all columns from original DataFrame in dedup_df
365
- dedup_df = df_original.loc[dedup_df.index].copy()
366
-
367
- # Create working copy of dedup_df with required columns for clustering
368
- working_df = dedup_df[[company_column, datetime_column, title_column, text_column]].copy()
369
- working_df.columns = ['company', 'datetime', 'title', 'text']
370
 
371
  # Step 2: Cluster deduplicated news
372
  processor = NewsProcessor(similarity_threshold, time_threshold)
373
- result_df = processor.process_news(working_df, progress_bar)
374
 
375
- # Initialize list of indices to keep
376
- indices_to_keep = []
377
 
378
  # Process each cluster
379
  for cluster_id in result_df['cluster_id'].unique():
380
- cluster_size = len(result_df[result_df['cluster_id'] == cluster_id])
 
381
 
382
  if cluster_size > 1:
383
  # For clusters with multiple items, keep only the one with longest text
384
- cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
385
- cluster_indices = cluster_rows.index
386
- text_lengths = dedup_df.iloc[cluster_indices][text_column].str.len()
387
- longest_text_idx = cluster_indices[text_lengths.argmax()]
388
- indices_to_keep.append(longest_text_idx)
389
  else:
390
  # For single-item clusters, keep the item
391
- indices_to_keep.extend(result_df[result_df['cluster_id'] == cluster_id].index)
392
 
393
  # Add all non-clustered rows from dedup_df
394
- non_clustered_indices = dedup_df.index[~dedup_df.index.isin(result_df.index)]
395
- indices_to_keep.extend(non_clustered_indices)
 
396
 
397
  # Create final declustered DataFrame from dedup_df
398
- declustered_df = dedup_df.iloc[sorted(indices_to_keep)].copy()
399
 
400
  st.success(f"""
401
  Processing results:
402
  - Original rows: {len(df_original)}
403
- - After deduplication: {len(dedup_df)}
404
  - Rows in clusters: {len(result_df)}
405
  - Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
406
  - Final rows after declustering: {len(declustered_df)}
@@ -408,7 +405,7 @@ def main():
408
 
409
  # Download buttons for all results
410
  st.subheader("Download Results")
411
- st.markdown(create_download_link(dedup_df, "deduplicated_news.xlsx"), unsafe_allow_html=True)
412
  st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
413
  st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
414
 
 
321
  return f'<a href="data:application/vnd.openxlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
322
 
323
  def main():
324
+ st.title("кластеризуем новости v.1.13")
325
  st.write("Upload Excel file with columns: company, datetime, text")
326
 
327
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
 
362
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
363
 
364
  # Preserve all columns from original DataFrame in dedup_df
365
+ dedup_df_full = df_original.loc[dedup_df.index].copy()
 
 
 
 
366
 
367
  # Step 2: Cluster deduplicated news
368
  processor = NewsProcessor(similarity_threshold, time_threshold)
369
+ result_df = processor.process_news(dedup_df, progress_bar)
370
 
371
+ # Initialize set of indices to keep
372
+ indices_to_keep = set()
373
 
374
  # Process each cluster
375
  for cluster_id in result_df['cluster_id'].unique():
376
+ cluster_mask = result_df['cluster_id'] == cluster_id
377
+ cluster_size = cluster_mask.sum()
378
 
379
  if cluster_size > 1:
380
  # For clusters with multiple items, keep only the one with longest text
381
+ cluster_indices = result_df[cluster_mask].index
382
+ text_lengths = dedup_df_full.loc[cluster_indices, text_column].str.len()
383
+ longest_text_idx = text_lengths.idxmax()
384
+ indices_to_keep.add(longest_text_idx)
 
385
  else:
386
  # For single-item clusters, keep the item
387
+ indices_to_keep.update(result_df[cluster_mask].index)
388
 
389
  # Add all non-clustered rows from dedup_df
390
+ clustered_indices = set(result_df.index)
391
+ non_clustered_indices = set(dedup_df_full.index) - clustered_indices
392
+ indices_to_keep.update(non_clustered_indices)
393
 
394
  # Create final declustered DataFrame from dedup_df
395
+ declustered_df = dedup_df_full.loc[list(indices_to_keep)].copy()
396
 
397
  st.success(f"""
398
  Processing results:
399
  - Original rows: {len(df_original)}
400
+ - After deduplication: {len(dedup_df_full)}
401
  - Rows in clusters: {len(result_df)}
402
  - Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
403
  - Final rows after declustering: {len(declustered_df)}
 
405
 
406
  # Download buttons for all results
407
  st.subheader("Download Results")
408
+ st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
409
  st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
410
  st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
411