pentarosarium committed on
Commit
21d6a34
·
1 Parent(s): 8a76f7a
Files changed (1) hide show
  1. app.py +36 -23
app.py CHANGED
@@ -321,24 +321,23 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
321
  return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
322
 
323
  def main():
324
- st.title("кластеризуем новости v.1.10")
325
  st.write("Upload Excel file with columns: company, datetime, text")
326
 
327
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
328
 
329
  if uploaded_file:
330
  try:
331
- # First, let's look at the columns in the file
332
  df_original = pd.read_excel(uploaded_file, sheet_name='Публикации')
333
  st.write("Available columns:", df_original.columns.tolist())
334
 
335
  # Create working copy with required columns
336
  df = df_original.copy()
337
- # Assuming the order is fixed in the Excel file, adjust indices if needed
338
- text_column = df_original.columns[6] # Adjust if needed
339
- title_column = df_original.columns[5] # Adjust if needed
340
- datetime_column = df_original.columns[3] # Adjust if needed
341
- company_column = df_original.columns[0] # Adjust if needed
342
 
343
  df = df_original[[company_column, datetime_column, title_column, text_column]].copy()
344
  df.columns = ['company', 'datetime', 'title', 'text']
@@ -366,29 +365,44 @@ def main():
366
 
367
  processor = NewsProcessor(similarity_threshold, time_threshold)
368
  result_df = processor.process_news(dedup_df, progress_bar)
369
- st.success(f"Found {result_df['cluster_id'].nunique()} clusters")
370
 
371
- # Process clusters to keep only longest text from each cluster
372
- clusters_to_process = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
 
 
 
 
373
  indices_to_keep = []
374
 
375
- for cluster_id in clusters_to_process:
376
- cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
377
- # Get original indices from cluster
378
- cluster_indices = cluster_rows.index
379
- # Find the index with longest text
380
- text_lengths = df_original.iloc[cluster_indices][text_column].str.len()
381
- longest_text_idx = cluster_indices[text_lengths.argmax()]
382
- indices_to_keep.append(longest_text_idx)
 
 
 
 
 
 
383
 
384
- # Add indices of rows that weren't in any cluster or were in single-row clusters
385
- non_clustered_indices = result_df[~result_df['cluster_id'].isin(clusters_to_process)].index
386
  indices_to_keep.extend(non_clustered_indices)
387
 
388
  # Create final declustered DataFrame
389
- declustered_df = df_original.iloc[indices_to_keep].copy()
390
 
391
- st.success(f"Kept {len(declustered_df)} news items after removing cluster duplicates")
 
 
 
 
 
 
392
 
393
  # Download buttons for all results
394
  st.subheader("Download Results")
@@ -413,7 +427,6 @@ def main():
413
 
414
  except Exception as e:
415
  st.error(f"Error: {str(e)}")
416
- # Print more detailed error information
417
  import traceback
418
  st.error(traceback.format_exc())
419
  finally:
 
321
  return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
322
 
323
  def main():
324
+ st.title("кластеризуем новости v.1.11")
325
  st.write("Upload Excel file with columns: company, datetime, text")
326
 
327
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
328
 
329
  if uploaded_file:
330
  try:
331
+ # Read all columns from original sheet
332
  df_original = pd.read_excel(uploaded_file, sheet_name='Публикации')
333
  st.write("Available columns:", df_original.columns.tolist())
334
 
335
  # Create working copy with required columns
336
  df = df_original.copy()
337
+ text_column = df_original.columns[6]
338
+ title_column = df_original.columns[5]
339
+ datetime_column = df_original.columns[3]
340
+ company_column = df_original.columns[0]
 
341
 
342
  df = df_original[[company_column, datetime_column, title_column, text_column]].copy()
343
  df.columns = ['company', 'datetime', 'title', 'text']
 
365
 
366
  processor = NewsProcessor(similarity_threshold, time_threshold)
367
  result_df = processor.process_news(dedup_df, progress_bar)
 
368
 
369
+ # Create a mapping between original indices and cluster information
370
+ index_to_cluster = pd.Series(0, index=df_original.index) # Default cluster 0 for non-clustered rows
371
+ for idx, row in result_df.iterrows():
372
+ index_to_cluster[idx] = row['cluster_id']
373
+
374
+ # Initialize list of indices to keep
375
  indices_to_keep = []
376
 
377
+ # Process each cluster
378
+ for cluster_id in result_df['cluster_id'].unique():
379
+ cluster_size = len(result_df[result_df['cluster_id'] == cluster_id])
380
+
381
+ if cluster_size > 1:
382
+ # For clusters with multiple items, keep only the one with longest text
383
+ cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
384
+ cluster_indices = cluster_rows.index
385
+ text_lengths = df_original.iloc[cluster_indices][text_column].str.len()
386
+ longest_text_idx = cluster_indices[text_lengths.argmax()]
387
+ indices_to_keep.append(longest_text_idx)
388
+ else:
389
+ # For single-item clusters, keep the item
390
+ indices_to_keep.extend(result_df[result_df['cluster_id'] == cluster_id].index)
391
 
392
+ # Add all non-clustered rows (cluster_id = 0)
393
+ non_clustered_indices = df_original.index[~df_original.index.isin(result_df.index)]
394
  indices_to_keep.extend(non_clustered_indices)
395
 
396
  # Create final declustered DataFrame
397
+ declustered_df = df_original.iloc[sorted(indices_to_keep)].copy()
398
 
399
+ st.success(f"""
400
+ Processing results:
401
+ - Original rows: {len(df_original)}
402
+ - Rows in clusters: {len(result_df)}
403
+ - Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
404
+ - Rows kept after declustering: {len(declustered_df)}
405
+ """)
406
 
407
  # Download buttons for all results
408
  st.subheader("Download Results")
 
427
 
428
  except Exception as e:
429
  st.error(f"Error: {str(e)}")
 
430
  import traceback
431
  st.error(traceback.format_exc())
432
  finally: