pentarosarium committed on
Commit
8b2771c
·
1 Parent(s): 6f609e4
Files changed (1) hide show
  1. app.py +18 -21
app.py CHANGED
@@ -321,7 +321,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
321
  return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
322
 
323
  def main():
324
- st.title("кластеризуем новости v.1.12")
325
  st.write("Upload Excel file with columns: company, datetime, text")
326
 
327
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -362,45 +362,42 @@ def main():
362
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
363
 
364
  # Preserve all columns from original DataFrame in dedup_df
365
- dedup_df = df_original.loc[dedup_df.index].copy()
366
-
367
- # Create working copy of dedup_df with required columns for clustering
368
- working_df = dedup_df[[company_column, datetime_column, title_column, text_column]].copy()
369
- working_df.columns = ['company', 'datetime', 'title', 'text']
370
 
371
  # Step 2: Cluster deduplicated news
372
  processor = NewsProcessor(similarity_threshold, time_threshold)
373
- result_df = processor.process_news(working_df, progress_bar)
374
 
375
- # Initialize list of indices to keep
376
- indices_to_keep = []
377
 
378
  # Process each cluster
379
  for cluster_id in result_df['cluster_id'].unique():
380
- cluster_size = len(result_df[result_df['cluster_id'] == cluster_id])
 
381
 
382
  if cluster_size > 1:
383
  # For clusters with multiple items, keep only the one with longest text
384
- cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
385
- cluster_indices = cluster_rows.index
386
- text_lengths = dedup_df.iloc[cluster_indices][text_column].str.len()
387
- longest_text_idx = cluster_indices[text_lengths.argmax()]
388
- indices_to_keep.append(longest_text_idx)
389
  else:
390
  # For single-item clusters, keep the item
391
- indices_to_keep.extend(result_df[result_df['cluster_id'] == cluster_id].index)
392
 
393
  # Add all non-clustered rows from dedup_df
394
- non_clustered_indices = dedup_df.index[~dedup_df.index.isin(result_df.index)]
395
- indices_to_keep.extend(non_clustered_indices)
 
396
 
397
  # Create final declustered DataFrame from dedup_df
398
- declustered_df = dedup_df.iloc[sorted(indices_to_keep)].copy()
399
 
400
  st.success(f"""
401
  Processing results:
402
  - Original rows: {len(df_original)}
403
- - After deduplication: {len(dedup_df)}
404
  - Rows in clusters: {len(result_df)}
405
  - Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
406
  - Final rows after declustering: {len(declustered_df)}
@@ -408,7 +405,7 @@ def main():
408
 
409
  # Download buttons for all results
410
  st.subheader("Download Results")
411
- st.markdown(create_download_link(dedup_df, "deduplicated_news.xlsx"), unsafe_allow_html=True)
412
  st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
413
  st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
414
 
 
321
  return f'<a href="data:application/vnd.openxlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
322
 
323
  def main():
324
+ st.title("кластеризуем новости v.1.13")
325
  st.write("Upload Excel file with columns: company, datetime, text")
326
 
327
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
 
362
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
363
 
364
  # Preserve all columns from original DataFrame in dedup_df
365
+ dedup_df_full = df_original.loc[dedup_df.index].copy()
 
 
 
 
366
 
367
  # Step 2: Cluster deduplicated news
368
  processor = NewsProcessor(similarity_threshold, time_threshold)
369
+ result_df = processor.process_news(dedup_df, progress_bar)
370
 
371
+ # Initialize set of indices to keep
372
+ indices_to_keep = set()
373
 
374
  # Process each cluster
375
  for cluster_id in result_df['cluster_id'].unique():
376
+ cluster_mask = result_df['cluster_id'] == cluster_id
377
+ cluster_size = cluster_mask.sum()
378
 
379
  if cluster_size > 1:
380
  # For clusters with multiple items, keep only the one with longest text
381
+ cluster_indices = result_df[cluster_mask].index
382
+ text_lengths = dedup_df_full.loc[cluster_indices, text_column].str.len()
383
+ longest_text_idx = text_lengths.idxmax()
384
+ indices_to_keep.add(longest_text_idx)
 
385
  else:
386
  # For single-item clusters, keep the item
387
+ indices_to_keep.update(result_df[cluster_mask].index)
388
 
389
  # Add all non-clustered rows from dedup_df
390
+ clustered_indices = set(result_df.index)
391
+ non_clustered_indices = set(dedup_df_full.index) - clustered_indices
392
+ indices_to_keep.update(non_clustered_indices)
393
 
394
  # Create final declustered DataFrame from dedup_df
395
+ declustered_df = dedup_df_full.loc[list(indices_to_keep)].copy()
396
 
397
  st.success(f"""
398
  Processing results:
399
  - Original rows: {len(df_original)}
400
+ - After deduplication: {len(dedup_df_full)}
401
  - Rows in clusters: {len(result_df)}
402
  - Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
403
  - Final rows after declustering: {len(declustered_df)}
 
405
 
406
  # Download buttons for all results
407
  st.subheader("Download Results")
408
+ st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
409
  st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
410
  st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
411