pentarosarium committed
Commit 67890fd · Parent: b3c96bc

1.22 print debug

Files changed (1)
  1. app.py +51 -79
app.py CHANGED
@@ -186,51 +186,30 @@ class NewsProcessor:
         if df.empty:
             return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'relevance_score', 'text', 'cluster_size'])
 
-        df = df.sort_values('datetime')
-
-        # First, filter out news where the company isn't the main subject
-        relevance_results = []
-        for idx, row in df.iterrows():
-            title = row['title'] if 'title' in row else ''
-            is_main, score = self.is_company_main_subject(title, row['text'], row['company'])
-            if is_main:
-                relevance_results.append({
-                    'idx': idx,
-                    'relevance_score': score
-                })
-
-        if not relevance_results:
-            return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'relevance_score', 'text', 'cluster_size'])
-
-        relevant_indices = [r['idx'] for r in relevance_results]
-        relevance_scores = {r['idx']: r['relevance_score'] for r in relevance_results}
-
-        df_filtered = df.loc[relevant_indices].copy()
-        df_filtered['relevance_score'] = df_filtered.index.map(relevance_scores)
-
-        # Continue with clustering logic...
+        df = df.copy()  # Make a copy to preserve original indices
 
         clusters = []
         processed = set()
 
-        for i in tqdm(range(len(df_filtered)), total=len(df_filtered)):
-            if i in processed:
+        for idx in df.index:  # Iterate over original indices
+            if idx in processed:
                 continue
 
-            row1 = df_filtered.iloc[i]
-            cluster = [df_filtered.index[i]]
-            processed.add(i)
+            row1 = df.loc[idx]
+            cluster = [idx]  # Store original index
+            processed.add(idx)
 
             if not pd.isna(row1['text']):
                 text1_embedding = self.encode_text(row1['text'])
 
             if progress_bar:
-                progress_bar.progress(len(processed) / len(df_filtered))
+                progress_bar.progress(len(processed) / len(df))
 
-            for j in range(len(df_filtered)):
-                if j in processed:
+            for other_idx in df.index:  # Iterate over original indices
+                if other_idx in processed:
                     continue
 
-                row2 = df_filtered.iloc[j]
+                row2 = df.loc[other_idx]
                 if pd.isna(row2['text']):
                     continue
@@ -242,12 +221,12 @@ class NewsProcessor:
                 similarity = np.dot(text1_embedding, text2_embedding)
 
                 if similarity >= self.similarity_threshold:
-                    cluster.append(df_filtered.index[j])
-                    processed.add(j)
+                    cluster.append(other_idx)
+                    processed.add(other_idx)
 
             clusters.append(cluster)
 
-        # Create result DataFrame
+        # Create result DataFrame preserving original indices
         result_data = []
         for cluster_id, cluster_indices in enumerate(clusters, 1):
            cluster_rows = df.loc[cluster_indices]
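
Note: np.dot of two embeddings equals cosine similarity only when both vectors are L2-normalized; encode_text is not shown in this diff, so that normalization is an assumption here. A quick illustration:

    import numpy as np

    v1 = np.array([3.0, 4.0])
    v2 = np.array([4.0, 3.0])
    v1 /= np.linalg.norm(v1)      # unit length
    v2 /= np.linalg.norm(v2)
    similarity = np.dot(v1, v2)   # 0.96 == cosine similarity
    print(similarity >= 0.9)      # True: these two would merge at a 0.9 threshold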
@@ -256,12 +235,12 @@ class NewsProcessor:
                     'cluster_id': cluster_id,
                     'datetime': df.loc[idx, 'datetime'],
                     'company': df.loc[idx, 'company'],
-                    'relevance_score': relevance_scores[idx],
                     'text': df.loc[idx, 'text'],
                     'cluster_size': len(cluster_indices)
                 })
 
-        return pd.DataFrame(result_data)
+        result_df = pd.DataFrame(result_data, index=sum(clusters, []))  # Use original indices
+        return result_df
 
 class NewsDeduplicator:
     def __init__(self, fuzzy_threshold=85):
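
Note: pd.DataFrame(result_data, index=sum(clusters, [])) relies on result_data being appended cluster by cluster in the same order the clusters list is built, so the flattened labels line up with the rows. A sketch with made-up labels:

    import pandas as pd

    clusters = [[0, 5], [9]]                      # original labels, grouped by cluster
    rows = [{'cluster_id': 1}, {'cluster_id': 1}, {'cluster_id': 2}]
    result_df = pd.DataFrame(rows, index=sum(clusters, []))  # flattens to [0, 5, 9]
    print(result_df.index.tolist())               # [0, 5, 9]

As an aside, sum(list_of_lists, []) is quadratic in the number of clusters; itertools.chain.from_iterable(clusters) gives the same flattening in linear time.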
@@ -322,7 +301,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
 
 
 def main():
-    st.title("кластеризуем новости v.1.21 print debug")
+    st.title("кластеризуем новости v.1.22 + print debug")
     st.write("Upload Excel file with columns: company, datetime, text")
 
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -360,61 +339,54 @@ def main():
         # Step 1: Deduplicate
         deduplicator = NewsDeduplicator(fuzzy_threshold)
         dedup_df = deduplicator.deduplicate(df, progress_bar)
-        st.write("\nAfter deduplication:")
-        st.write(f"dedup_df indices: {dedup_df.index.tolist()}")
-        st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
-
-        # Preserve all columns from original DataFrame in dedup_df
-        dedup_df_full = df_original.loc[dedup_df.index].copy()
-        st.write(f"dedup_df_full indices: {dedup_df_full.index.tolist()}")
+        st.write("\nDeduplication Results:")
+        st.write(f"Original indices: {df.index.tolist()}")
+        st.write(f"Dedup indices: {dedup_df.index.tolist()}")
+        st.write(f"Sample from dedup_df:")
+        st.write(dedup_df[['company', 'text']].head())
 
         # Step 2: Cluster deduplicated news
         processor = NewsProcessor(similarity_threshold, time_threshold)
         result_df = processor.process_news(dedup_df, progress_bar)
-        st.write("\nAfter clustering:")
-        st.write(f"result_df indices: {result_df.index.tolist()}")
+        st.write("\nClustering Results:")
+        st.write(f"Result df indices: {result_df.index.tolist()}")
+
+        # Display cluster information
+        if len(result_df) > 0:
+            st.write("\nCluster Details:")
+            for cluster_id in result_df['cluster_id'].unique():
+                cluster_mask = result_df['cluster_id'] == cluster_id
+                if sum(cluster_mask) > 1:  # Only show multi-item clusters
+                    cluster_indices = result_df[cluster_mask].index.tolist()
+                    st.write(f"\nCluster {cluster_id}:")
+                    st.write(f"Indices: {cluster_indices}")
+                    # Show texts for verification
+                    for idx in cluster_indices:
+                        text_length = len(str(dedup_df.loc[idx, 'text']))
+                        st.write(f"Index {idx} - Length {text_length}:")
+                        st.write(str(dedup_df.loc[idx, 'text'])[:100] + '...')
 
-        # Initialize set of indices to delete
+        # Process clusters for deletion
         indices_to_delete = set()
 
-        # Find rows to delete from multi-item clusters
         if len(result_df) > 0:
-            # Get all multi-item clusters
-            multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
-            st.write(f"\nMulti-clusters found: {multi_clusters.tolist()}")
-
-            # For each multi-item cluster
-            for cluster_id in multi_clusters:
-                st.write(f"\nProcessing cluster {cluster_id}:")
-                # Get rows in this cluster
+            for cluster_id in result_df['cluster_id'].unique():
                 cluster_mask = result_df['cluster_id'] == cluster_id
-                cluster_rows = result_df[cluster_mask]
-                st.write(f"Cluster rows indices: {cluster_rows.index.tolist()}")
-
-                # Get their original indices from dedup_df_full
-                original_indices = dedup_df.index[cluster_rows.index]
-
-                #original_indices = dedup_df_full.index[cluster_rows.index - 1] -it was wrong!
-                st.write(f"Original indices: {original_indices.tolist()}")
-
-                # Find the row with longest text among these indices
-                text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
-                st.write(f"Text lengths: {text_lengths.to_dict()}")
-                longest_text_idx = text_lengths.idxmax()
-                st.write(f"Longest text index: {longest_text_idx}")
-
-                # Add all other indices to delete set
-                new_indices_to_delete = set(original_indices) - {longest_text_idx}
-                indices_to_delete.update(new_indices_to_delete)
-                st.write(f"Indices to delete from this cluster: {new_indices_to_delete}")
+                if sum(cluster_mask) > 1:
+                    cluster_indices = result_df[cluster_mask].index.tolist()
+                    text_lengths = dedup_df.loc[cluster_indices, 'text'].fillna('').str.len()
+                    longest_text_idx = text_lengths.idxmax()
+                    indices_to_delete.update(set(cluster_indices) - {longest_text_idx})
 
-        st.write(f"\nFinal indices to delete: {sorted(list(indices_to_delete))}")
+        st.write("\nDeletion Summary:")
+        st.write(f"Indices to delete: {sorted(list(indices_to_delete))}")
 
-        # Create final declustered DataFrame by removing identified rows
-        declustered_df = dedup_df_full.copy()
+        # Create final DataFrame
+        declustered_df = dedup_df.copy()
        if indices_to_delete:
             declustered_df = declustered_df.drop(index=list(indices_to_delete))
-        st.write(f"\nFinal kept indices: {sorted(declustered_df.index.tolist())}")
+
+        st.write(f"Final indices kept: {sorted(declustered_df.index.tolist())}")
 
         # Print statistics
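
Note: the rewritten deletion pass keeps, for each multi-item cluster, only the row with the longest text. Because result_df now carries dedup_df's labels (the point of this commit), the same labels index both frames directly and the intermediate dedup_df_full mapping is gone. A compact sketch with made-up data:

    import pandas as pd

    dedup_df = pd.DataFrame({'text': ['short', 'the longest story', None]}, index=[0, 5, 9])
    cluster_indices = [0, 5, 9]                                 # one multi-item cluster
    text_lengths = dedup_df.loc[cluster_indices, 'text'].fillna('').str.len()
    longest_text_idx = text_lengths.idxmax()                    # 5 (ties keep the first label)
    to_delete = set(cluster_indices) - {longest_text_idx}       # {0, 9}
    print(dedup_df.drop(index=list(to_delete)).index.tolist())  # [5]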
 