Commit · 67890fd
1 Parent(s): b3c96bc
1.22 print debug
app.py CHANGED
@@ -186,51 +186,30 @@ class NewsProcessor:
         if df.empty:
             return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'relevance_score', 'text', 'cluster_size'])

-        df = df.
+        df = df.copy() # Make a copy to preserve original indices

-        # First, filter out news where the company isn't the main subject
-        relevance_results = []
-        for idx, row in df.iterrows():
-            title = row['title'] if 'title' in row else ''
-            is_main, score = self.is_company_main_subject(title, row['text'], row['company'])
-            if is_main:
-                relevance_results.append({
-                    'idx': idx,
-                    'relevance_score': score
-                })
-
-        if not relevance_results:
-            return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'relevance_score', 'text', 'cluster_size'])
-
-        relevant_indices = [r['idx'] for r in relevance_results]
-        relevance_scores = {r['idx']: r['relevance_score'] for r in relevance_results}
-
-        df_filtered = df.loc[relevant_indices].copy()
-        df_filtered['relevance_score'] = df_filtered.index.map(relevance_scores)
-
-        # Continue with clustering logic...
         clusters = []
         processed = set()

-        for
-            if
+        for idx in df.index: # Iterate over original indices
+            if idx in processed:
                 continue

-            row1 =
-            cluster = [
-            processed.add(
+            row1 = df.loc[idx]
+            cluster = [idx] # Store original index
+            processed.add(idx)

             if not pd.isna(row1['text']):
                 text1_embedding = self.encode_text(row1['text'])

             if progress_bar:
-                progress_bar.progress(len(processed) / len(
+                progress_bar.progress(len(processed) / len(df))

-            for
-                if
+            for other_idx in df.index: # Iterate over original indices
+                if other_idx in processed:
                     continue

-                row2 =
+                row2 = df.loc[other_idx]
                 if pd.isna(row2['text']):
                     continue

@@ -242,12 +221,12 @@ class NewsProcessor:
                 similarity = np.dot(text1_embedding, text2_embedding)

                 if similarity >= self.similarity_threshold:
-                    cluster.append(
-                    processed.add(
+                    cluster.append(other_idx)
+                    processed.add(other_idx)

         clusters.append(cluster)

-        # Create result DataFrame
+        # Create result DataFrame preserving original indices
         result_data = []
         for cluster_id, cluster_indices in enumerate(clusters, 1):
             cluster_rows = df.loc[cluster_indices]
@@ -256,12 +235,12 @@ class NewsProcessor:
                     'cluster_id': cluster_id,
                     'datetime': df.loc[idx, 'datetime'],
                     'company': df.loc[idx, 'company'],
-                    'relevance_score': relevance_scores[idx],
                     'text': df.loc[idx, 'text'],
                     'cluster_size': len(cluster_indices)
                 })

-
+        result_df = pd.DataFrame(result_data, index=sum(clusters, [])) # Use original indices
+        return result_df

 class NewsDeduplicator:
     def __init__(self, fuzzy_threshold=85):
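For reference, a minimal standalone sketch of the index-preserving clustering pass that the hunks above introduce. This is an illustration, not the app's exact code: cluster_by_similarity and the encode_text parameter are hypothetical stand-ins for the NewsProcessor methods, embeddings are assumed to be L2-normalized (so the dot product equals cosine similarity), and rows with missing text are skipped outright instead of being kept as singleton clusters.

import numpy as np
import pandas as pd

def cluster_by_similarity(df: pd.DataFrame, encode_text, threshold: float = 0.9) -> pd.DataFrame:
    # Greedy single-pass clustering that keeps the DataFrame's original index labels.
    clusters = []        # each cluster is a list of original index labels
    processed = set()

    for idx in df.index:        # iterate over index labels, never over positions
        if idx in processed or pd.isna(df.loc[idx, 'text']):
            continue
        cluster = [idx]
        processed.add(idx)
        seed_vec = encode_text(df.loc[idx, 'text'])

        for other_idx in df.index:
            if other_idx in processed or pd.isna(df.loc[other_idx, 'text']):
                continue
            # With L2-normalized embeddings the dot product equals cosine similarity.
            if np.dot(seed_vec, encode_text(df.loc[other_idx, 'text'])) >= threshold:
                cluster.append(other_idx)
                processed.add(other_idx)

        clusters.append(cluster)

    rows = [
        {'cluster_id': cid, 'cluster_size': len(members), 'text': df.loc[i, 'text']}
        for cid, members in enumerate(clusters, 1)
        for i in members
    ]
    flat_index = [i for members in clusters for i in members]
    return pd.DataFrame(rows, index=flat_index)   # result rows keep their original labels

Because each cluster stores original index labels and the result reuses them as its index, a later dedup_df.loc[result.index] lines up without positional arithmetic — exactly the bookkeeping that the removed cluster_rows.index - 1 workaround in main() got wrong.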
@@ -322,7 +301,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:


 def main():
-    st.title("кластеризуем новости v.1.
+    st.title("кластеризуем новости v.1.22 + print debug")
     st.write("Upload Excel file with columns: company, datetime, text")

     uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -360,61 +339,54 @@ def main():
         # Step 1: Deduplicate
         deduplicator = NewsDeduplicator(fuzzy_threshold)
         dedup_df = deduplicator.deduplicate(df, progress_bar)
-        st.write("\
-        st.write(f"
-        st.
-
-
-        dedup_df_full = df_original.loc[dedup_df.index].copy()
-        st.write(f"dedup_df_full indices: {dedup_df_full.index.tolist()}")
+        st.write("\nDeduplication Results:")
+        st.write(f"Original indices: {df.index.tolist()}")
+        st.write(f"Dedup indices: {dedup_df.index.tolist()}")
+        st.write(f"Sample from dedup_df:")
+        st.write(dedup_df[['company', 'text']].head())

         # Step 2: Cluster deduplicated news
         processor = NewsProcessor(similarity_threshold, time_threshold)
         result_df = processor.process_news(dedup_df, progress_bar)
-        st.write("\
-        st.write(f"
+        st.write("\nClustering Results:")
+        st.write(f"Result df indices: {result_df.index.tolist()}")
+
+        # Display cluster information
+        if len(result_df) > 0:
+            st.write("\nCluster Details:")
+            for cluster_id in result_df['cluster_id'].unique():
+                cluster_mask = result_df['cluster_id'] == cluster_id
+                if sum(cluster_mask) > 1: # Only show multi-item clusters
+                    cluster_indices = result_df[cluster_mask].index.tolist()
+                    st.write(f"\nCluster {cluster_id}:")
+                    st.write(f"Indices: {cluster_indices}")
+                    # Show texts for verification
+                    for idx in cluster_indices:
+                        text_length = len(str(dedup_df.loc[idx, 'text']))
+                        st.write(f"Index {idx} - Length {text_length}:")
+                        st.write(str(dedup_df.loc[idx, 'text'])[:100] + '...')

-        #
+        # Process clusters for deletion
        indices_to_delete = set()

-        # Find rows to delete from multi-item clusters
         if len(result_df) > 0:
-
-            multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
-            st.write(f"\nMulti-clusters found: {multi_clusters.tolist()}")
-
-            # For each multi-item cluster
-            for cluster_id in multi_clusters:
-                st.write(f"\nProcessing cluster {cluster_id}:")
-                # Get rows in this cluster
+            for cluster_id in result_df['cluster_id'].unique():
                 cluster_mask = result_df['cluster_id'] == cluster_id
-
-
-
-
-
-
-                #original_indices = dedup_df_full.index[cluster_rows.index - 1] -it was wrong!
-                st.write(f"Original indices: {original_indices.tolist()}")
-
-                # Find the row with longest text among these indices
-                text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
-                st.write(f"Text lengths: {text_lengths.to_dict()}")
-                longest_text_idx = text_lengths.idxmax()
-                st.write(f"Longest text index: {longest_text_idx}")
-
-                # Add all other indices to delete set
-                new_indices_to_delete = set(original_indices) - {longest_text_idx}
-                indices_to_delete.update(new_indices_to_delete)
-                st.write(f"Indices to delete from this cluster: {new_indices_to_delete}")
+                if sum(cluster_mask) > 1:
+                    cluster_indices = result_df[cluster_mask].index.tolist()
+                    text_lengths = dedup_df.loc[cluster_indices, 'text'].fillna('').str.len()
+                    longest_text_idx = text_lengths.idxmax()
+                    indices_to_delete.update(set(cluster_indices) - {longest_text_idx})

-        st.write(
+        st.write("\nDeletion Summary:")
+        st.write(f"Indices to delete: {sorted(list(indices_to_delete))}")

-        # Create final
-        declustered_df =
+        # Create final DataFrame
+        declustered_df = dedup_df.copy()
         if indices_to_delete:
             declustered_df = declustered_df.drop(index=list(indices_to_delete))
-
+
+        st.write(f"Final indices kept: {sorted(declustered_df.index.tolist())}")


         # Print statistics
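The deletion pass in the hunk above — keep only the longest text from each multi-item cluster — can also be written without the manual mask loop. A sketch under the same assumption the new code relies on, namely that result_df and dedup_df share index labels; pick_rows_to_drop is an illustrative name, not a function in the app:

import pandas as pd

def pick_rows_to_drop(result_df: pd.DataFrame, dedup_df: pd.DataFrame) -> set:
    # Collect index labels to drop: every member of a multi-item cluster except its longest text.
    to_drop = set()
    multi = result_df[result_df['cluster_size'] > 1]
    for cluster_id, group in multi.groupby('cluster_id'):
        lengths = dedup_df.loc[group.index, 'text'].fillna('').str.len()
        to_drop.update(set(group.index) - {lengths.idxmax()})
    return to_drop

Usage would be declustered_df = dedup_df.drop(index=list(pick_rows_to_drop(result_df, dedup_df))); the if indices_to_delete: guard becomes unnecessary because drop accepts an empty list.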
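Since the point of this debug build is to confirm that index labels survive each stage, the st.write traces could be backed by a cheap invariant check; a sketch reusing the variable names from the diff (the assertion messages are illustrative):

# Every clustered row must come from the deduplicated frame,
# and no label slated for deletion may survive into the final frame.
assert set(result_df.index) <= set(dedup_df.index), "clustering introduced unknown index labels"
assert not (set(declustered_df.index) & indices_to_delete), "a label marked for deletion was kept"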