Spaces:
Sleeping
Sleeping
Commit
·
ac7c699
1
Parent(s):
f06b820
1.20 print debug
Browse files
app.py
CHANGED
@@ -322,7 +322,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
|
|
322 |
|
323 |
|
324 |
def main():
|
325 |
-
st.title("кластеризуем новости v.1.
|
326 |
st.write("Upload Excel file with columns: company, datetime, text")
|
327 |
|
328 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
@@ -360,19 +360,19 @@ def main():
|
|
360 |
# Step 1: Deduplicate
|
361 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
362 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
363 |
-
|
364 |
-
|
365 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
366 |
|
367 |
# Preserve all columns from original DataFrame in dedup_df
|
368 |
dedup_df_full = df_original.loc[dedup_df.index].copy()
|
369 |
-
|
370 |
|
371 |
# Step 2: Cluster deduplicated news
|
372 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
373 |
result_df = processor.process_news(dedup_df, progress_bar)
|
374 |
-
|
375 |
-
|
376 |
|
377 |
# Initialize set of indices to delete
|
378 |
indices_to_delete = set()
|
@@ -381,38 +381,38 @@ def main():
|
|
381 |
if len(result_df) > 0:
|
382 |
# Get all multi-item clusters
|
383 |
multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
|
384 |
-
|
385 |
|
386 |
# For each multi-item cluster
|
387 |
for cluster_id in multi_clusters:
|
388 |
-
|
389 |
# Get rows in this cluster
|
390 |
cluster_mask = result_df['cluster_id'] == cluster_id
|
391 |
cluster_rows = result_df[cluster_mask]
|
392 |
-
|
393 |
|
394 |
# Get their original indices from dedup_df_full
|
395 |
original_indices = dedup_df_full.index[cluster_rows.index - 1]
|
396 |
-
|
397 |
|
398 |
# Find the row with longest text among these indices
|
399 |
text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
|
400 |
-
|
401 |
longest_text_idx = text_lengths.idxmax()
|
402 |
-
|
403 |
|
404 |
# Add all other indices to delete set
|
405 |
new_indices_to_delete = set(original_indices) - {longest_text_idx}
|
406 |
indices_to_delete.update(new_indices_to_delete)
|
407 |
-
|
408 |
|
409 |
-
|
410 |
|
411 |
# Create final declustered DataFrame by removing identified rows
|
412 |
declustered_df = dedup_df_full.copy()
|
413 |
if indices_to_delete:
|
414 |
declustered_df = declustered_df.drop(index=list(indices_to_delete))
|
415 |
-
|
416 |
|
417 |
|
418 |
# Print statistics
|
|
|
322 |
|
323 |
|
324 |
def main():
|
325 |
+
st.title("кластеризуем новости v.1.20 print debug")
|
326 |
st.write("Upload Excel file with columns: company, datetime, text")
|
327 |
|
328 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
|
|
360 |
# Step 1: Deduplicate
|
361 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
362 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
363 |
+
st.write("\nAfter deduplication:")
|
364 |
+
st.write(f"dedup_df indices: {dedup_df.index.tolist()}")
|
365 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
366 |
|
367 |
# Preserve all columns from original DataFrame in dedup_df
|
368 |
dedup_df_full = df_original.loc[dedup_df.index].copy()
|
369 |
+
st.write(f"dedup_df_full indices: {dedup_df_full.index.tolist()}")
|
370 |
|
371 |
# Step 2: Cluster deduplicated news
|
372 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
373 |
result_df = processor.process_news(dedup_df, progress_bar)
|
374 |
+
st.write("\nAfter clustering:")
|
375 |
+
st.write(f"result_df indices: {result_df.index.tolist()}")
|
376 |
|
377 |
# Initialize set of indices to delete
|
378 |
indices_to_delete = set()
|
|
|
381 |
if len(result_df) > 0:
|
382 |
# Get all multi-item clusters
|
383 |
multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
|
384 |
+
st.write(f"\nMulti-clusters found: {multi_clusters.tolist()}")
|
385 |
|
386 |
# For each multi-item cluster
|
387 |
for cluster_id in multi_clusters:
|
388 |
+
st.write(f"\nProcessing cluster {cluster_id}:")
|
389 |
# Get rows in this cluster
|
390 |
cluster_mask = result_df['cluster_id'] == cluster_id
|
391 |
cluster_rows = result_df[cluster_mask]
|
392 |
+
st.write(f"Cluster rows indices: {cluster_rows.index.tolist()}")
|
393 |
|
394 |
# Get their original indices from dedup_df_full
|
395 |
original_indices = dedup_df_full.index[cluster_rows.index - 1]
|
396 |
+
st.write(f"Original indices: {original_indices.tolist()}")
|
397 |
|
398 |
# Find the row with longest text among these indices
|
399 |
text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
|
400 |
+
st.write(f"Text lengths: {text_lengths.to_dict()}")
|
401 |
longest_text_idx = text_lengths.idxmax()
|
402 |
+
st.write(f"Longest text index: {longest_text_idx}")
|
403 |
|
404 |
# Add all other indices to delete set
|
405 |
new_indices_to_delete = set(original_indices) - {longest_text_idx}
|
406 |
indices_to_delete.update(new_indices_to_delete)
|
407 |
+
st.write(f"Indices to delete from this cluster: {new_indices_to_delete}")
|
408 |
|
409 |
+
st.write(f"\nFinal indices to delete: {sorted(list(indices_to_delete))}")
|
410 |
|
411 |
# Create final declustered DataFrame by removing identified rows
|
412 |
declustered_df = dedup_df_full.copy()
|
413 |
if indices_to_delete:
|
414 |
declustered_df = declustered_df.drop(index=list(indices_to_delete))
|
415 |
+
st.write(f"\nFinal kept indices: {sorted(declustered_df.index.tolist())}")
|
416 |
|
417 |
|
418 |
# Print statistics
|