Spaces:
Sleeping
Sleeping
Commit
·
6f609e4
1
Parent(s):
21d6a34
1.12
Browse files
app.py
CHANGED
@@ -321,7 +321,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
|
|
321 |
return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
|
322 |
|
323 |
def main():
|
324 |
-
st.title("кластеризуем новости v.1.
|
325 |
st.write("Upload Excel file with columns: company, datetime, text")
|
326 |
|
327 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
@@ -356,20 +356,21 @@ def main():
|
|
356 |
try:
|
357 |
progress_bar = st.progress(0)
|
358 |
|
|
|
359 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
360 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
361 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
362 |
|
363 |
-
|
364 |
-
|
365 |
|
366 |
-
|
367 |
-
|
|
|
368 |
|
369 |
-
#
|
370 |
-
|
371 |
-
|
372 |
-
index_to_cluster[idx] = row['cluster_id']
|
373 |
|
374 |
# Initialize list of indices to keep
|
375 |
indices_to_keep = []
|
@@ -382,26 +383,27 @@ def main():
|
|
382 |
# For clusters with multiple items, keep only the one with longest text
|
383 |
cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
|
384 |
cluster_indices = cluster_rows.index
|
385 |
-
text_lengths =
|
386 |
longest_text_idx = cluster_indices[text_lengths.argmax()]
|
387 |
indices_to_keep.append(longest_text_idx)
|
388 |
else:
|
389 |
# For single-item clusters, keep the item
|
390 |
indices_to_keep.extend(result_df[result_df['cluster_id'] == cluster_id].index)
|
391 |
|
392 |
-
# Add all non-clustered rows
|
393 |
-
non_clustered_indices =
|
394 |
indices_to_keep.extend(non_clustered_indices)
|
395 |
|
396 |
-
# Create final declustered DataFrame
|
397 |
-
declustered_df =
|
398 |
|
399 |
st.success(f"""
|
400 |
Processing results:
|
401 |
- Original rows: {len(df_original)}
|
|
|
402 |
- Rows in clusters: {len(result_df)}
|
403 |
- Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
|
404 |
-
-
|
405 |
""")
|
406 |
|
407 |
# Download buttons for all results
|
|
|
321 |
return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
|
322 |
|
323 |
def main():
|
324 |
+
st.title("кластеризуем новости v.1.12")
|
325 |
st.write("Upload Excel file with columns: company, datetime, text")
|
326 |
|
327 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
|
|
356 |
try:
|
357 |
progress_bar = st.progress(0)
|
358 |
|
359 |
+
# Step 1: Deduplicate
|
360 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
361 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
362 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
363 |
|
364 |
+
# Preserve all columns from original DataFrame in dedup_df
|
365 |
+
dedup_df = df_original.loc[dedup_df.index].copy()
|
366 |
|
367 |
+
# Create working copy of dedup_df with required columns for clustering
|
368 |
+
working_df = dedup_df[[company_column, datetime_column, title_column, text_column]].copy()
|
369 |
+
working_df.columns = ['company', 'datetime', 'title', 'text']
|
370 |
|
371 |
+
# Step 2: Cluster deduplicated news
|
372 |
+
processor = NewsProcessor(similarity_threshold, time_threshold)
|
373 |
+
result_df = processor.process_news(working_df, progress_bar)
|
|
|
374 |
|
375 |
# Initialize list of indices to keep
|
376 |
indices_to_keep = []
|
|
|
383 |
# For clusters with multiple items, keep only the one with longest text
|
384 |
cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
|
385 |
cluster_indices = cluster_rows.index
|
386 |
+
text_lengths = dedup_df.iloc[cluster_indices][text_column].str.len()
|
387 |
longest_text_idx = cluster_indices[text_lengths.argmax()]
|
388 |
indices_to_keep.append(longest_text_idx)
|
389 |
else:
|
390 |
# For single-item clusters, keep the item
|
391 |
indices_to_keep.extend(result_df[result_df['cluster_id'] == cluster_id].index)
|
392 |
|
393 |
+
# Add all non-clustered rows from dedup_df
|
394 |
+
non_clustered_indices = dedup_df.index[~dedup_df.index.isin(result_df.index)]
|
395 |
indices_to_keep.extend(non_clustered_indices)
|
396 |
|
397 |
+
# Create final declustered DataFrame from dedup_df
|
398 |
+
declustered_df = dedup_df.iloc[sorted(indices_to_keep)].copy()
|
399 |
|
400 |
st.success(f"""
|
401 |
Processing results:
|
402 |
- Original rows: {len(df_original)}
|
403 |
+
- After deduplication: {len(dedup_df)}
|
404 |
- Rows in clusters: {len(result_df)}
|
405 |
- Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
|
406 |
+
- Final rows after declustering: {len(declustered_df)}
|
407 |
""")
|
408 |
|
409 |
# Download buttons for all results
|