Commit e032dc3 · 1.16
1 Parent(s): 30c87ad

app.py CHANGED
@@ -322,7 +322,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
 
 
 def main():
-    st.title("кластеризуем новости v.1.…
+    st.title("кластеризуем новости v.1.16")
     st.write("Upload Excel file with columns: company, datetime, text")
 
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -362,69 +362,58 @@ def main():
         dedup_df = deduplicator.deduplicate(df, progress_bar)
         st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
 
-        # …
+        # Preserve all columns from original DataFrame in dedup_df
         dedup_df_full = df_original.loc[dedup_df.index].copy()
 
-        # Create working copy for clustering with required columns
-        working_df = dedup_df_full[[company_column, datetime_column, title_column, text_column]].copy()
-        working_df.columns = ['company', 'datetime', 'title', 'text']
-
         # Step 2: Cluster deduplicated news
         processor = NewsProcessor(similarity_threshold, time_threshold)
-        result_df = processor.process_news(…
+        result_df = processor.process_news(dedup_df, progress_bar)
 
-        # …
+        # Initialize set of indices to delete
         indices_to_delete = set()
 
-        # …
-        if …
-            …
-            text_lengths = working_df.loc[cluster_indices, 'text'].str.len()
-            longest_text_idx = text_lengths.idxmax()
-
-            # Add all other indices from this cluster to deletion set
-            cluster_indices_to_delete = set(cluster_indices) - {longest_text_idx}
-            indices_to_delete.update(cluster_indices_to_delete)
+        # Find rows to delete from multi-item clusters
+        if len(result_df) > 0:
+            # Get all multi-item clusters
+            multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
+
+            # For each multi-item cluster
+            for cluster_id in multi_clusters:
+                # Get indices of all rows in this cluster
+                cluster_indices = result_df[result_df['cluster_id'] == cluster_id].index.tolist()
+
+                # Get their text lengths
+                text_lengths = dedup_df_full.loc[cluster_indices, text_column].fillna('').str.len()
+
+                # Find index with longest text
+                longest_text_idx = text_lengths.idxmax()
+
+                # Add all other indices to delete set
+                indices_to_delete.update(set(cluster_indices) - {longest_text_idx})
 
-        # Create final declustered DataFrame by …
+        # Create final declustered DataFrame by removing identified rows
         declustered_df = dedup_df_full.copy()
         if indices_to_delete:
            declustered_df = declustered_df.drop(index=list(indices_to_delete))
 
         # Print statistics
         st.success(f"""
         Processing results:
         - Original rows: {len(df_original)}
         - After deduplication: {len(dedup_df_full)}
-        - …
-        - Rows removed from …
-        - Final rows …
+        - Multi-item clusters found: {len(multi_clusters) if len(result_df) > 0 else 0}
+        - Rows removed from clusters: {len(indices_to_delete)}
+        - Final rows kept: {len(declustered_df)}
         """)
 
-        # Add debugging information
-        if not result_df.empty:
-            multi_clusters = len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())
-            st.write(f"Number of multi-member clusters found: {multi_clusters}")
-
-            # Show cluster sizes
-            cluster_sizes = result_df['cluster_size'].value_counts().sort_index()
-            st.write("Cluster size distribution:")
-            st.write(cluster_sizes)
-
         # Download buttons for all results
         st.subheader("Download Results")
         st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
+        st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
         st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
 
-        # Show …
-        if …
+        # Show clusters info
+        if len(result_df) > 0:
             st.subheader("Largest Clusters")
             largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
                 ['cluster_size', 'cluster_id', 'datetime'],