Spaces:
Sleeping
Sleeping
Commit
·
25558c7
1
Parent(s):
67890fd
1.23 print debug
Browse files
app.py
CHANGED
@@ -301,7 +301,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
|
|
301 |
|
302 |
|
303 |
def main():
|
304 |
-
st.title("кластеризуем новости v.1.
|
305 |
st.write("Upload Excel file with columns: company, datetime, text")
|
306 |
|
307 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
@@ -339,6 +339,10 @@ def main():
|
|
339 |
# Step 1: Deduplicate
|
340 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
341 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
|
|
|
|
|
|
|
|
342 |
st.write("\nDeduplication Results:")
|
343 |
st.write(f"Original indices: {df.index.tolist()}")
|
344 |
st.write(f"Dedup indices: {dedup_df.index.tolist()}")
|
@@ -382,19 +386,18 @@ def main():
|
|
382 |
st.write(f"Indices to delete: {sorted(list(indices_to_delete))}")
|
383 |
|
384 |
# Create final DataFrame
|
385 |
-
declustered_df =
|
386 |
if indices_to_delete:
|
387 |
declustered_df = declustered_df.drop(index=list(indices_to_delete))
|
388 |
|
389 |
st.write(f"Final indices kept: {sorted(declustered_df.index.tolist())}")
|
390 |
-
|
391 |
|
392 |
# Print statistics
|
393 |
st.success(f"""
|
394 |
Processing results:
|
395 |
- Original rows: {len(df_original)}
|
396 |
- After deduplication: {len(dedup_df_full)}
|
397 |
-
- Multi-item clusters found: {len(
|
398 |
- Rows removed from clusters: {len(indices_to_delete)}
|
399 |
- Final rows kept: {len(declustered_df)}
|
400 |
""")
|
|
|
301 |
|
302 |
|
303 |
def main():
|
304 |
+
st.title("кластеризуем новости v.1.23 + print debug")
|
305 |
st.write("Upload Excel file with columns: company, datetime, text")
|
306 |
|
307 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
|
|
339 |
# Step 1: Deduplicate
|
340 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
341 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
342 |
+
|
343 |
+
# Preserve all columns from original DataFrame in dedup_df_full
|
344 |
+
dedup_df_full = df_original.loc[dedup_df.index].copy()
|
345 |
+
|
346 |
st.write("\nDeduplication Results:")
|
347 |
st.write(f"Original indices: {df.index.tolist()}")
|
348 |
st.write(f"Dedup indices: {dedup_df.index.tolist()}")
|
|
|
386 |
st.write(f"Indices to delete: {sorted(list(indices_to_delete))}")
|
387 |
|
388 |
# Create final DataFrame
|
389 |
+
declustered_df = dedup_df_full.copy()
|
390 |
if indices_to_delete:
|
391 |
declustered_df = declustered_df.drop(index=list(indices_to_delete))
|
392 |
|
393 |
st.write(f"Final indices kept: {sorted(declustered_df.index.tolist())}")
|
|
|
394 |
|
395 |
# Print statistics
|
396 |
st.success(f"""
|
397 |
Processing results:
|
398 |
- Original rows: {len(df_original)}
|
399 |
- After deduplication: {len(dedup_df_full)}
|
400 |
+
- Multi-item clusters found: {len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()) if len(result_df) > 0 else 0}
|
401 |
- Rows removed from clusters: {len(indices_to_delete)}
|
402 |
- Final rows kept: {len(declustered_df)}
|
403 |
""")
|