Spaces:
Sleeping
Sleeping
Commit
·
21d6a34
1
Parent(s):
8a76f7a
1.11
Browse files
app.py
CHANGED
@@ -321,24 +321,23 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
|
|
321 |
return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
|
322 |
|
323 |
def main():
|
324 |
-
st.title("кластеризуем новости v.1.
|
325 |
st.write("Upload Excel file with columns: company, datetime, text")
|
326 |
|
327 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
328 |
|
329 |
if uploaded_file:
|
330 |
try:
|
331 |
-
#
|
332 |
df_original = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
333 |
st.write("Available columns:", df_original.columns.tolist())
|
334 |
|
335 |
# Create working copy with required columns
|
336 |
df = df_original.copy()
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
company_column = df_original.columns[0] # Adjust if needed
|
342 |
|
343 |
df = df_original[[company_column, datetime_column, title_column, text_column]].copy()
|
344 |
df.columns = ['company', 'datetime', 'title', 'text']
|
@@ -366,29 +365,44 @@ def main():
|
|
366 |
|
367 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
368 |
result_df = processor.process_news(dedup_df, progress_bar)
|
369 |
-
st.success(f"Found {result_df['cluster_id'].nunique()} clusters")
|
370 |
|
371 |
-
#
|
372 |
-
|
|
|
|
|
|
|
|
|
373 |
indices_to_keep = []
|
374 |
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
383 |
|
384 |
-
# Add
|
385 |
-
non_clustered_indices =
|
386 |
indices_to_keep.extend(non_clustered_indices)
|
387 |
|
388 |
# Create final declustered DataFrame
|
389 |
-
declustered_df = df_original.iloc[indices_to_keep].copy()
|
390 |
|
391 |
-
st.success(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
|
393 |
# Download buttons for all results
|
394 |
st.subheader("Download Results")
|
@@ -413,7 +427,6 @@ def main():
|
|
413 |
|
414 |
except Exception as e:
|
415 |
st.error(f"Error: {str(e)}")
|
416 |
-
# Print more detailed error information
|
417 |
import traceback
|
418 |
st.error(traceback.format_exc())
|
419 |
finally:
|
|
|
321 |
return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
|
322 |
|
323 |
def main():
|
324 |
+
st.title("кластеризуем новости v.1.11")
|
325 |
st.write("Upload Excel file with columns: company, datetime, text")
|
326 |
|
327 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
328 |
|
329 |
if uploaded_file:
|
330 |
try:
|
331 |
+
# Read all columns from original sheet
|
332 |
df_original = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
333 |
st.write("Available columns:", df_original.columns.tolist())
|
334 |
|
335 |
# Create working copy with required columns
|
336 |
df = df_original.copy()
|
337 |
+
text_column = df_original.columns[6]
|
338 |
+
title_column = df_original.columns[5]
|
339 |
+
datetime_column = df_original.columns[3]
|
340 |
+
company_column = df_original.columns[0]
|
|
|
341 |
|
342 |
df = df_original[[company_column, datetime_column, title_column, text_column]].copy()
|
343 |
df.columns = ['company', 'datetime', 'title', 'text']
|
|
|
365 |
|
366 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
367 |
result_df = processor.process_news(dedup_df, progress_bar)
|
|
|
368 |
|
369 |
+
# Create a mapping between original indices and cluster information
|
370 |
+
index_to_cluster = pd.Series(0, index=df_original.index) # Default cluster 0 for non-clustered rows
|
371 |
+
for idx, row in result_df.iterrows():
|
372 |
+
index_to_cluster[idx] = row['cluster_id']
|
373 |
+
|
374 |
+
# Initialize list of indices to keep
|
375 |
indices_to_keep = []
|
376 |
|
377 |
+
# Process each cluster
|
378 |
+
for cluster_id in result_df['cluster_id'].unique():
|
379 |
+
cluster_size = len(result_df[result_df['cluster_id'] == cluster_id])
|
380 |
+
|
381 |
+
if cluster_size > 1:
|
382 |
+
# For clusters with multiple items, keep only the one with longest text
|
383 |
+
cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
|
384 |
+
cluster_indices = cluster_rows.index
|
385 |
+
text_lengths = df_original.iloc[cluster_indices][text_column].str.len()
|
386 |
+
longest_text_idx = cluster_indices[text_lengths.argmax()]
|
387 |
+
indices_to_keep.append(longest_text_idx)
|
388 |
+
else:
|
389 |
+
# For single-item clusters, keep the item
|
390 |
+
indices_to_keep.extend(result_df[result_df['cluster_id'] == cluster_id].index)
|
391 |
|
392 |
+
# Add all non-clustered rows (cluster_id = 0)
|
393 |
+
non_clustered_indices = df_original.index[~df_original.index.isin(result_df.index)]
|
394 |
indices_to_keep.extend(non_clustered_indices)
|
395 |
|
396 |
# Create final declustered DataFrame
|
397 |
+
declustered_df = df_original.iloc[sorted(indices_to_keep)].copy()
|
398 |
|
399 |
+
st.success(f"""
|
400 |
+
Processing results:
|
401 |
+
- Original rows: {len(df_original)}
|
402 |
+
- Rows in clusters: {len(result_df)}
|
403 |
+
- Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
|
404 |
+
- Rows kept after declustering: {len(declustered_df)}
|
405 |
+
""")
|
406 |
|
407 |
# Download buttons for all results
|
408 |
st.subheader("Download Results")
|
|
|
427 |
|
428 |
except Exception as e:
|
429 |
st.error(f"Error: {str(e)}")
|
|
|
430 |
import traceback
|
431 |
st.error(traceback.format_exc())
|
432 |
finally:
|