Spaces:

pentarosarium
/

clusters

Sleeping

App Files Community

pentarosarium committed on Nov 29, 2024

Commit

21d6a34

1 Parent(s): 8a76f7a

1.11

Browse files

Files changed (1) hide show

app.py +36 -23

app.py CHANGED Viewed

@@ -321,24 +321,23 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
     return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
 def main():
-    st.title("кластеризуем новости v.1.10")
     st.write("Upload Excel file with columns: company, datetime, text")
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
     if uploaded_file:
         try:
-            # First, let's look at the columns in the file
             df_original = pd.read_excel(uploaded_file, sheet_name='Публикации')
             st.write("Available columns:", df_original.columns.tolist())
             # Create working copy with required columns
             df = df_original.copy()
-            # Assuming the order is fixed in the Excel file, adjust indices if needed
-            text_column = df_original.columns[6]  # Adjust if needed
-            title_column = df_original.columns[5]  # Adjust if needed
-            datetime_column = df_original.columns[3]  # Adjust if needed
-            company_column = df_original.columns[0]  # Adjust if needed
             df = df_original[[company_column, datetime_column, title_column, text_column]].copy()
             df.columns = ['company', 'datetime', 'title', 'text']
@@ -366,29 +365,44 @@ def main():
                     processor = NewsProcessor(similarity_threshold, time_threshold)
                     result_df = processor.process_news(dedup_df, progress_bar)
-                    st.success(f"Found {result_df['cluster_id'].nunique()} clusters")
-                    # Process clusters to keep only longest text from each cluster
-                    clusters_to_process = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
                     indices_to_keep = []
-                    for cluster_id in clusters_to_process:
-                        cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
-                        # Get original indices from cluster
-                        cluster_indices = cluster_rows.index
-                        # Find the index with longest text
-                        text_lengths = df_original.iloc[cluster_indices][text_column].str.len()
-                        longest_text_idx = cluster_indices[text_lengths.argmax()]
-                        indices_to_keep.append(longest_text_idx)
-                    # Add indices of rows that weren't in any cluster or were in single-row clusters
-                    non_clustered_indices = result_df[~result_df['cluster_id'].isin(clusters_to_process)].index
                     indices_to_keep.extend(non_clustered_indices)
                     # Create final declustered DataFrame
-                    declustered_df = df_original.iloc[indices_to_keep].copy()
-                    st.success(f"Kept {len(declustered_df)} news items after removing cluster duplicates")
                     # Download buttons for all results
                     st.subheader("Download Results")
@@ -413,7 +427,6 @@ def main():
                 except Exception as e:
                     st.error(f"Error: {str(e)}")
-                    # Print more detailed error information
                     import traceback
                     st.error(traceback.format_exc())
                 finally:

     return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
 def main():
+    st.title("кластеризуем новости v.1.11")
     st.write("Upload Excel file with columns: company, datetime, text")
     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
     if uploaded_file:
         try:
+            # Read all columns from original sheet
             df_original = pd.read_excel(uploaded_file, sheet_name='Публикации')
             st.write("Available columns:", df_original.columns.tolist())
             # Create working copy with required columns
             df = df_original.copy()
+            text_column = df_original.columns[6]
+            title_column = df_original.columns[5]
+            datetime_column = df_original.columns[3]
+            company_column = df_original.columns[0]
             df = df_original[[company_column, datetime_column, title_column, text_column]].copy()
             df.columns = ['company', 'datetime', 'title', 'text']
                     processor = NewsProcessor(similarity_threshold, time_threshold)
                     result_df = processor.process_news(dedup_df, progress_bar)
+                    # Create a mapping between original indices and cluster information
+                    index_to_cluster = pd.Series(0, index=df_original.index)  # Default cluster 0 for non-clustered rows
+                    for idx, row in result_df.iterrows():
+                        index_to_cluster[idx] = row['cluster_id']
+                    # Initialize list of indices to keep
                     indices_to_keep = []
+                    # Process each cluster
+                    for cluster_id in result_df['cluster_id'].unique():
+                        cluster_size = len(result_df[result_df['cluster_id'] == cluster_id])
+                        if cluster_size > 1:
+                            # For clusters with multiple items, keep only the one with longest text
+                            cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
+                            cluster_indices = cluster_rows.index
+                            text_lengths = df_original.iloc[cluster_indices][text_column].str.len()
+                            longest_text_idx = cluster_indices[text_lengths.argmax()]
+                            indices_to_keep.append(longest_text_idx)
+                        else:
+                            # For single-item clusters, keep the item
+                            indices_to_keep.extend(result_df[result_df['cluster_id'] == cluster_id].index)
+                    # Add all non-clustered rows (cluster_id = 0)
+                    non_clustered_indices = df_original.index[~df_original.index.isin(result_df.index)]
                     indices_to_keep.extend(non_clustered_indices)
                     # Create final declustered DataFrame
+                    declustered_df = df_original.iloc[sorted(indices_to_keep)].copy()
+                    st.success(f"""
+                        Processing results:
+                        - Original rows: {len(df_original)}
+                        - Rows in clusters: {len(result_df)}
+                        - Multi-item clusters: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())}
+                        - Rows kept after declustering: {len(declustered_df)}
+                    """)
                     # Download buttons for all results
                     st.subheader("Download Results")
                 except Exception as e:
                     st.error(f"Error: {str(e)}")
                     import traceback
                     st.error(traceback.format_exc())
                 finally: