Spaces:
Sleeping
Sleeping
Commit
·
8b2771c
1
Parent(s):
6f609e4
1.13
Browse files
app.py
CHANGED
@@ -321,7 +321,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
|
|
321 |
return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
|
322 |
|
323 |
def main():
|
324 |
-
st.title("кластеризуем новости v.1.
|
325 |
st.write("Upload Excel file with columns: company, datetime, text")
|
326 |
|
327 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
@@ -362,45 +362,42 @@ def main():
|
|
362 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
363 |
|
364 |
# Preserve all columns from original DataFrame in dedup_df
|
365 |
-
|
366 |
-
|
367 |
-
# Create working copy of dedup_df with required columns for clustering
|
368 |
-
working_df = dedup_df[[company_column, datetime_column, title_column, text_column]].copy()
|
369 |
-
working_df.columns = ['company', 'datetime', 'title', 'text']
|
370 |
|
371 |
# Step 2: Cluster deduplicated news
|
372 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
373 |
-
result_df = processor.process_news(
|
374 |
|
375 |
-
# Initialize
|
376 |
-
indices_to_keep =
|
377 |
|
378 |
# Process each cluster
|
379 |
for cluster_id in result_df['cluster_id'].unique():
|
380 |
-
|
|
|
381 |
|
382 |
if cluster_size > 1:
|
383 |
# For clusters with multiple items, keep only the one with longest text
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
indices_to_keep.append(longest_text_idx)
|
389 |
else:
|
390 |
# For single-item clusters, keep the item
|
391 |
-
indices_to_keep.
|
392 |
|
393 |
# Add all non-clustered rows from dedup_df
|
394 |
-
|
395 |
-
|
|
|
396 |
|
397 |
# Create final declustered DataFrame from dedup_df
|
398 |
-
declustered_df =
|
399 |
|
400 |
st.success(f"""
|
401 |
Processing results:
|
402 |
- Original rows: {len(df_original)}
|
403 |
-
- After deduplication: {len(
|
404 |
- Rows in clusters: {len(result_df)}
|
405 |
- Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
|
406 |
- Final rows after declustering: {len(declustered_df)}
|
@@ -408,7 +405,7 @@ def main():
|
|
408 |
|
409 |
# Download buttons for all results
|
410 |
st.subheader("Download Results")
|
411 |
-
st.markdown(create_download_link(
|
412 |
st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
|
413 |
st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
|
414 |
|
|
|
321 |
return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
|
322 |
|
323 |
def main():
|
324 |
+
st.title("кластеризуем новости v.1.13")
|
325 |
st.write("Upload Excel file with columns: company, datetime, text")
|
326 |
|
327 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
|
|
362 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
363 |
|
364 |
# Preserve all columns from original DataFrame in dedup_df
|
365 |
+
dedup_df_full = df_original.loc[dedup_df.index].copy()
|
|
|
|
|
|
|
|
|
366 |
|
367 |
# Step 2: Cluster deduplicated news
|
368 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
369 |
+
result_df = processor.process_news(dedup_df, progress_bar)
|
370 |
|
371 |
+
# Initialize set of indices to keep
|
372 |
+
indices_to_keep = set()
|
373 |
|
374 |
# Process each cluster
|
375 |
for cluster_id in result_df['cluster_id'].unique():
|
376 |
+
cluster_mask = result_df['cluster_id'] == cluster_id
|
377 |
+
cluster_size = cluster_mask.sum()
|
378 |
|
379 |
if cluster_size > 1:
|
380 |
# For clusters with multiple items, keep only the one with longest text
|
381 |
+
cluster_indices = result_df[cluster_mask].index
|
382 |
+
text_lengths = dedup_df_full.loc[cluster_indices, text_column].str.len()
|
383 |
+
longest_text_idx = text_lengths.idxmax()
|
384 |
+
indices_to_keep.add(longest_text_idx)
|
|
|
385 |
else:
|
386 |
# For single-item clusters, keep the item
|
387 |
+
indices_to_keep.update(result_df[cluster_mask].index)
|
388 |
|
389 |
# Add all non-clustered rows from dedup_df
|
390 |
+
clustered_indices = set(result_df.index)
|
391 |
+
non_clustered_indices = set(dedup_df_full.index) - clustered_indices
|
392 |
+
indices_to_keep.update(non_clustered_indices)
|
393 |
|
394 |
# Create final declustered DataFrame from dedup_df
|
395 |
+
declustered_df = dedup_df_full.loc[list(indices_to_keep)].copy()
|
396 |
|
397 |
st.success(f"""
|
398 |
Processing results:
|
399 |
- Original rows: {len(df_original)}
|
400 |
+
- After deduplication: {len(dedup_df_full)}
|
401 |
- Rows in clusters: {len(result_df)}
|
402 |
- Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
|
403 |
- Final rows after declustering: {len(declustered_df)}
|
|
|
405 |
|
406 |
# Download buttons for all results
|
407 |
st.subheader("Download Results")
|
408 |
+
st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
|
409 |
st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
|
410 |
st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
|
411 |
|