pentarosarium commited on
Commit
6c1fc10
·
1 Parent(s): 8b0d8c1

complete revamp

Browse files
Files changed (1) hide show
  1. app.py +150 -741
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import gradio as gr
2
- import spaces
3
  import pandas as pd
4
  import torch
5
  from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel
@@ -9,37 +8,13 @@ import io
9
  from rapidfuzz import fuzz
10
  import time
11
  import os
12
- groq_key = os.environ['groq_key']
13
- from langchain_openai import ChatOpenAI
14
- from langchain.prompts import PromptTemplate
15
- from openpyxl import load_workbook
16
- from openpyxl.utils.dataframe import dataframe_to_rows
17
- import torch.nn.functional as F
18
- import numpy as np
19
- import logging
20
  from typing import List, Set, Tuple
 
21
 
22
- def fuzzy_deduplicate(df, column, threshold=55):
23
- """Deduplicate rows based on fuzzy matching of text content"""
24
- seen_texts = []
25
- indices_to_keep = []
26
-
27
- for i, text in enumerate(df[column]):
28
- if pd.isna(text):
29
- indices_to_keep.append(i)
30
- continue
31
-
32
- text = str(text)
33
- if not seen_texts or all(fuzz.ratio(text, seen) < threshold for seen in seen_texts):
34
- seen_texts.append(text)
35
- indices_to_keep.append(i)
36
-
37
- return df.iloc[indices_to_keep]
38
-
39
  logging.basicConfig(level=logging.INFO)
40
  logger = logging.getLogger(__name__)
41
 
42
-
43
  class GPUTaskManager:
44
  def __init__(self, max_retries=3, retry_delay=30, cleanup_callback=None):
45
  self.max_retries = max_retries
@@ -47,12 +22,11 @@ class GPUTaskManager:
47
  self.cleanup_callback = cleanup_callback
48
 
49
  async def run_with_retry(self, task_func, *args, **kwargs):
50
- """Execute a GPU task with retry logic"""
51
  for attempt in range(self.max_retries):
52
  try:
53
  return await task_func(*args, **kwargs)
54
  except Exception as e:
55
- if "GPU task aborted" in str(e) or "GPU quota" in str(e):
56
  if attempt < self.max_retries - 1:
57
  if self.cleanup_callback:
58
  self.cleanup_callback()
@@ -63,34 +37,7 @@ class GPUTaskManager:
63
 
64
  @staticmethod
65
  def batch_process(items, batch_size=3):
66
- """Split items into smaller batches"""
67
  return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
68
-
69
- @staticmethod
70
- def is_gpu_error(error):
71
- """Check if an error is GPU-related"""
72
- error_msg = str(error).lower()
73
- return any(msg in error_msg for msg in [
74
- "gpu task aborted",
75
- "gpu quota",
76
- "cuda out of memory",
77
- "device-side assert"
78
- ])
79
-
80
-
81
-
82
- class ProcessControl:
83
- def __init__(self):
84
- self.stop_requested = False
85
-
86
- def request_stop(self):
87
- self.stop_requested = True
88
-
89
- def should_stop(self):
90
- return self.stop_requested
91
-
92
- def reset(self):
93
- self.stop_requested = False
94
 
95
  class ProcessControl:
96
  def __init__(self):
@@ -117,14 +64,13 @@ class EventDetector:
117
  device = "cuda" if torch.cuda.is_available() else "cpu"
118
  logger.info(f"Initializing models on device: {device}")
119
 
120
- # Initialize all models
121
- self.initialize_models(device)
122
 
123
  # Initialize transformer for declusterization
124
  self.tokenizer_cluster = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
125
  self.model_cluster = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2').to(device)
126
 
127
- self.device = device
128
  self.initialized = True
129
  logger.info("All models initialized successfully")
130
 
@@ -132,146 +78,80 @@ class EventDetector:
132
  logger.error(f"Error in EventDetector initialization: {str(e)}")
133
  raise
134
 
135
- def mean_pooling(self, model_output, attention_mask):
136
- token_embeddings = model_output[0]
137
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
138
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
139
-
140
- def encode_text(self, text):
141
- if pd.isna(text):
142
- text = ""
143
- text = str(text)
144
-
145
- encoded_input = self.tokenizer_cluster(text, padding=True, truncation=True, max_length=512, return_tensors='pt').to(self.device)
146
- with torch.no_grad():
147
- model_output = self.model_cluster(**encoded_input)
148
- sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
149
- return torch.nn.functional.normalize(sentence_embeddings[0], p=2, dim=0)
150
-
151
- @spaces.GPU(duration=20)
152
- def decluster_texts(self, df, text_column, similarity_threshold=0.75, time_threshold=24):
153
  try:
154
- if df.empty:
155
- return df
 
 
 
 
156
 
157
- # Sort by datetime if available
158
- if 'datetime' in df.columns:
159
- df = df.sort_values('datetime')
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- # Initialize lists and sets for tracking
162
- indices_to_delete = set()
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
- # Process each text
165
- for i in df.index:
166
- if i in indices_to_delete: # Skip if already marked for deletion
167
- continue
168
-
169
- text1 = df.loc[i, text_column]
170
- if pd.isna(text1):
171
- continue
172
-
173
- text1_embedding = self.encode_text(text1)
174
- current_cluster = []
175
-
176
- # Compare with other texts
177
- for j in df.index:
178
- if i == j or j in indices_to_delete: # Skip same text or already marked
179
- continue
180
-
181
- text2 = df.loc[j, text_column]
182
- if pd.isna(text2):
183
- continue
184
-
185
- # Check time difference if datetime available
186
- if 'datetime' in df.columns:
187
- time_diff = pd.to_datetime(df.loc[j, 'datetime']) - pd.to_datetime(df.loc[i, 'datetime'])
188
- if abs(time_diff.total_seconds() / 3600) > time_threshold:
189
- continue
190
-
191
- text2_embedding = self.encode_text(text2)
192
- similarity = torch.dot(text1_embedding, text2_embedding).item()
193
-
194
- if similarity >= similarity_threshold:
195
- current_cluster.append(j)
196
-
197
- # If we found similar texts, keep the longest one
198
- if current_cluster:
199
- current_cluster.append(i) # Add the current text to cluster
200
- text_lengths = df.loc[current_cluster, text_column].fillna('').str.len()
201
- longest_text_idx = text_lengths.idxmax()
202
-
203
- # Mark all except longest for deletion
204
- indices_to_delete.update(set(current_cluster) - {longest_text_idx})
205
 
206
- # Return DataFrame without deleted rows
207
- return df.drop(index=list(indices_to_delete))
 
 
 
 
 
 
208
 
209
  except Exception as e:
210
- logger.error(f"Declusterization error: {str(e)}")
211
- return df
212
-
213
- @spaces.GPU(duration=30)
214
- def initialize_models(self, device):
215
- """Initialize all models with GPU support"""
216
- # Initialize translation model
217
- self.translator = pipeline(
218
- "translation",
219
- model="Helsinki-NLP/opus-mt-ru-en",
220
- device=device
221
- )
222
-
223
- self.rutranslator = pipeline(
224
- "translation",
225
- model="Helsinki-NLP/opus-mt-en-ru",
226
- device=device
227
- )
228
-
229
- # Initialize sentiment models
230
- self.finbert = pipeline(
231
- "sentiment-analysis",
232
- model="ProsusAI/finbert",
233
- device=device,
234
- truncation=True,
235
- max_length=512
236
- )
237
- self.roberta = pipeline(
238
- "sentiment-analysis",
239
- model="cardiffnlp/twitter-roberta-base-sentiment",
240
- device=device,
241
- truncation=True,
242
- max_length=512
243
- )
244
- self.finbert_tone = pipeline(
245
- "sentiment-analysis",
246
- model="yiyanghkust/finbert-tone",
247
- device=device,
248
- truncation=True,
249
- max_length=512
250
- )
251
-
252
- # Initialize MT5 model
253
- self.model_name = "google/mt5-small"
254
- self.tokenizer = AutoTokenizer.from_pretrained(
255
- self.model_name,
256
- legacy=True
257
- )
258
- self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(device)
259
-
260
- # Initialize Groq
261
- if 'groq_key':
262
- self.groq = ChatOpenAI(
263
- base_url="https://api.groq.com/openai/v1",
264
- model="llama-3.1-70b-versatile",
265
- openai_api_key=groq_key,
266
- temperature=0.0
267
- )
268
- else:
269
- logger.warning("Groq API key not found, impact estimation will be limited")
270
- self.groq = None
271
 
272
- @spaces.GPU(duration=20)
273
  def _translate_text(self, text):
274
- """Translate Russian text to English"""
275
  try:
276
  if not text or not isinstance(text, str):
277
  return ""
@@ -280,7 +160,6 @@ class EventDetector:
280
  if not text:
281
  return ""
282
 
283
- # Split into manageable chunks
284
  max_length = 450
285
  chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
286
  translated_chunks = []
@@ -288,7 +167,7 @@ class EventDetector:
288
  for chunk in chunks:
289
  result = self.translator(chunk)[0]['translation_text']
290
  translated_chunks.append(result)
291
- time.sleep(0.1) # Rate limiting
292
 
293
  return " ".join(translated_chunks)
294
 
@@ -296,179 +175,48 @@ class EventDetector:
296
  logger.error(f"Translation error: {str(e)}")
297
  return text
298
 
299
- @spaces.GPU(duration=20)
300
  def analyze_sentiment(self, text):
301
- """Enhanced sentiment analysis with better negative detection"""
302
  try:
303
- if not text or not isinstance(text, str):
304
  return "Neutral"
305
 
306
- text = text.strip()
307
- if not text:
308
- return "Neutral"
309
-
310
- # Get predictions with confidence scores
311
  finbert_result = self.finbert(text)[0]
312
  roberta_result = self.roberta(text)[0]
313
- finbert_tone_result = self.finbert_tone(text)[0]
314
 
315
- # Enhanced sentiment mapping with confidence thresholds
316
- def map_sentiment(result):
 
317
  label = result['label'].lower()
318
- score = result['score']
319
-
320
- # Higher threshold for positive to reduce false positives
321
- if label in ['positive', 'pos', 'positive tone'] and score > 0.75:
322
- return "Positive"
323
- # Lower threshold for negative to catch more cases
324
- elif label in ['negative', 'neg', 'negative tone'] and score > 0.75:
325
- return "Negative"
326
- # Consider high-confidence neutral predictions
327
- elif label == 'neutral' and score > 0.8:
328
- return "Neutral"
329
- # Default to negative for uncertain cases in financial context
330
  else:
331
- return "Negative" if score > 0.4 else "Neutral"
332
-
333
- # Get mapped sentiments with confidence-based logic
334
- sentiments = [
335
- map_sentiment(finbert_result),
336
- map_sentiment(roberta_result),
337
- map_sentiment(finbert_tone_result)
338
- ]
339
 
340
- # Weighted voting - prioritize negative signals
341
- if "Negative" in sentiments:
342
- neg_count = sentiments.count("Negative")
343
- if neg_count >= 2: # negative should be consensus
344
- return "Negative"
345
-
346
  pos_count = sentiments.count("Positive")
347
- if pos_count >= 2: # Require stronger positive consensus
 
 
 
 
348
  return "Positive"
349
-
350
  return "Neutral"
351
 
352
  except Exception as e:
353
  logger.error(f"Sentiment analysis error: {str(e)}")
354
  return "Neutral"
355
 
356
- def estimate_impact(self, text, entity):
357
- """Estimate impact using Groq for negative sentiment texts"""
358
- try:
359
- if not self.groq:
360
- return "Неопределенный эффект", "Groq API недоступен"
361
-
362
- template = """
363
- You are a financial analyst. Analyze this news about {entity} and assess its potential impact.
364
-
365
- News: {news}
366
-
367
- Classify the impact into one of these categories:
368
- 1. "Значительный риск убытков" (Significant loss risk)
369
- 2. "Умеренный риск убытков" (Moderate loss risk)
370
- 3. "Незначительный риск убытков" (Minor loss risk)
371
- 4. "Вероятность прибыли" (Potential profit)
372
- 5. "Неопределенный эффект" (Uncertain effect)
373
-
374
- Format your response exactly as:
375
- Impact: [category]
376
- Reasoning: [explanation in 2-3 sentences]
377
- """
378
-
379
- prompt = PromptTemplate(template=template, input_variables=["entity", "news"])
380
- chain = prompt | self.groq
381
-
382
- response = chain.invoke({
383
- "entity": entity,
384
- "news": text
385
- })
386
-
387
- # Parse response
388
- response_text = response.content if hasattr(response, 'content') else str(response)
389
-
390
- if "Impact:" in response_text and "Reasoning:" in response_text:
391
- parts = response_text.split("Reasoning:")
392
- impact = parts[0].split("Impact:")[1].strip()
393
- reasoning = parts[1].strip()
394
- else:
395
- impact = "Неопределенный эффект"
396
- reasoning = "Не удалось определить влияние"
397
-
398
- return impact, reasoning
399
-
400
- except Exception as e:
401
- logger.error(f"Impact estimation error: {str(e)}")
402
- return "Неопределенный эффект", f"Ошибка анализа: {str(e)}"
403
-
404
-
405
- @spaces.GPU(duration=60)
406
- def process_text(self, text, entity):
407
- """Process text with Groq-driven sentiment analysis"""
408
- try:
409
- translated_text = self._translate_text(text)
410
- initial_sentiment = self.analyze_sentiment(translated_text)
411
-
412
- impact = "Неопределенный эффект"
413
- reasoning = ""
414
-
415
- # Always get Groq analysis for all texts
416
- impact, reasoning = self.estimate_impact(translated_text, entity)
417
- reasoning = self.rutranslator(reasoning)[0]['translation_text']
418
-
419
- # Override sentiment based on Groq impact
420
- final_sentiment = initial_sentiment
421
- if impact == "Вероятность прибыли":
422
- final_sentiment = "Positive"
423
-
424
- event_type, event_summary = self.detect_events(text, entity)
425
-
426
- return {
427
- 'translated_text': translated_text,
428
- 'sentiment': final_sentiment,
429
- 'impact': impact,
430
- 'reasoning': reasoning,
431
- 'event_type': event_type,
432
- 'event_summary': event_summary
433
- }
434
-
435
- except Exception as e:
436
- logger.error(f"Text processing error: {str(e)}")
437
- return {
438
- 'translated_text': '',
439
- 'sentiment': 'Neutral',
440
- 'impact': 'Неопределенный эффект',
441
- 'reasoning': f'Ошибка обработки: {str(e)}',
442
- 'event_type': 'Нет',
443
- 'event_summary': ''
444
- }
445
-
446
-
447
-
448
-
449
- @spaces.GPU(duration=20)
450
  def detect_events(self, text, entity):
 
451
  if not text or not entity:
452
  return "Нет", "Invalid input"
453
 
454
  try:
455
- # Improved prompt for MT5
456
- prompt = f"""<s>Analyze this news about {entity}:
457
-
458
- Text: {text}
459
-
460
- Classify this news into ONE of these categories:
461
- 1. "Отчетность" if about: financial reports, revenue, profit, EBITDA, financial results, quarterly/annual reports
462
- 2. "Суд" if about: court cases, lawsuits, arbitration, bankruptcy, legal proceedings
463
- 3. "РЦБ" if about: bonds, securities, defaults, debt restructuring, coupon payments
464
- 4. "Нет" if none of the above
465
-
466
- Provide classification and 2-3 sentence summary focusing on key facts.
467
-
468
- Format response exactly as:
469
- Category: [category name]
470
- Summary: [brief factual summary]</s>"""
471
-
472
  inputs = self.tokenizer(
473
  prompt,
474
  return_tensors="pt",
@@ -479,29 +227,22 @@ class EventDetector:
479
 
480
  outputs = self.model.generate(
481
  **inputs,
482
- max_length=200,
483
  num_return_sequences=1,
484
- do_sample=False,
485
- temperature=0.7,
486
- top_p=0.9,
487
- no_repeat_ngram_size=3
488
  )
489
 
490
  response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
491
 
492
- # Extract category and summary
493
- if "Category:" in response and "Summary:" in response:
494
- parts = response.split("Summary:")
495
- category = parts[0].split("Category:")[1].strip()
496
- summary = parts[1].strip()
497
-
498
- # Validate category
499
- valid_categories = {"Отчетность", "Суд", "РЦБ", "Нет"}
500
- category = category if category in valid_categories else "Нет"
501
-
502
- return category, summary
503
-
504
- return "Нет", "Could not classify event"
505
 
506
  except Exception as e:
507
  logger.error(f"Event detection error: {str(e)}")
@@ -514,7 +255,6 @@ class EventDetector:
514
  self.translator = None
515
  self.finbert = None
516
  self.roberta = None
517
- self.finbert_tone = None
518
  torch.cuda.empty_cache()
519
  self.initialized = False
520
  logger.info("Cleaned up GPU resources")
@@ -522,6 +262,7 @@ class EventDetector:
522
  logger.error(f"Error in cleanup: {str(e)}")
523
 
524
  def create_visualizations(df):
 
525
  if df is None or df.empty:
526
  return None, None
527
 
@@ -547,426 +288,94 @@ def create_visualizations(df):
547
  except Exception as e:
548
  logger.error(f"Visualization error: {e}")
549
  return None, None
550
-
551
-
552
- @spaces.GPU
553
- def process_file(file_obj):
554
- try:
555
- logger.info("Starting to read Excel file...")
556
- df = pd.read_excel(file_obj, sheet_name='Публикации')
557
- logger.info(f"Successfully read Excel file. Shape: {df.shape}")
558
-
559
- # Deduplication
560
- original_count = len(df)
561
- df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
562
- logger.info(f"Removed {original_count - len(df)} duplicate entries")
563
-
564
- detector = EventDetector()
565
- processed_rows = []
566
- total = len(df)
567
-
568
- # Process in smaller batches with quota management
569
- BATCH_SIZE = 3 # Reduced batch size
570
- QUOTA_WAIT_TIME = 60 # Wait time when quota is exceeded
571
-
572
- for batch_start in range(0, total, BATCH_SIZE):
573
- try:
574
- batch_end = min(batch_start + BATCH_SIZE, total)
575
- batch = df.iloc[batch_start:batch_end]
576
-
577
- # Initialize models for batch
578
- if not detector.initialized:
579
- detector.initialize_models()
580
- time.sleep(1) # Wait after initialization
581
-
582
- for idx, row in batch.iterrows():
583
- try:
584
- text = str(row.get('Выдержки из текста', ''))
585
- if not text.strip():
586
- continue
587
-
588
- entity = str(row.get('Объект', ''))
589
- if not entity.strip():
590
- continue
591
-
592
- # Process with GPU quota management
593
- event_type = "Нет"
594
- event_summary = ""
595
- sentiment = "Neutral"
596
-
597
- try:
598
- event_type, event_summary = detector.detect_events(text, entity)
599
- time.sleep(1) # Wait between GPU operations
600
- sentiment = detector.analyze_sentiment(text)
601
- except Exception as e:
602
- if "GPU quota" in str(e):
603
- logger.warning("GPU quota exceeded, waiting...")
604
- time.sleep(QUOTA_WAIT_TIME)
605
- continue
606
- else:
607
- raise e
608
-
609
- processed_rows.append({
610
- 'Объект': entity,
611
- 'Заголовок': str(row.get('Заголовок', '')),
612
- 'Sentiment': sentiment,
613
- 'Event_Type': event_type,
614
- 'Event_Summary': event_summary,
615
- 'Текст': text[:1000]
616
- })
617
-
618
- logger.info(f"Processed {idx + 1}/{total} rows")
619
-
620
- except Exception as e:
621
- logger.error(f"Error processing row {idx}: {str(e)}")
622
- continue
623
-
624
- # Create intermediate results
625
- if processed_rows:
626
- intermediate_df = pd.DataFrame(processed_rows)
627
- yield (
628
- intermediate_df,
629
- None,
630
- None,
631
- f"Обработано {len(processed_rows)}/{total} строк"
632
- )
633
-
634
- # Wait between batches
635
- time.sleep(2)
636
-
637
- # Cleanup GPU resources after each batch
638
- torch.cuda.empty_cache()
639
-
640
- except Exception as e:
641
- logger.error(f"Batch processing error: {str(e)}")
642
- if "GPU quota" in str(e):
643
- time.sleep(QUOTA_WAIT_TIME)
644
- continue
645
-
646
- # Final results
647
- if processed_rows:
648
- result_df = pd.DataFrame(processed_rows)
649
- fig_sentiment, fig_events = create_visualizations(result_df)
650
- return result_df, fig_sentiment, fig_events, "Обработка завершена!"
651
- else:
652
- return None, None, None, "Нет обработанных данных"
653
-
654
- except Exception as e:
655
- logger.error(f"File processing error: {str(e)}")
656
- raise
657
-
658
- def create_output_file(df, uploaded_file):
659
- """Create Excel file with multiple sheets from processed DataFrame"""
660
- try:
661
- wb = load_workbook("sample_file.xlsx")
662
-
663
- # 1. Update 'Публикации' sheet
664
- ws = wb['Публикации']
665
- for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
666
- for c_idx, value in enumerate(row, start=1):
667
- ws.cell(row=r_idx, column=c_idx, value=value)
668
-
669
- # 2. Update 'Мониторинг' sheet with events
670
- ws = wb['Мониторинг']
671
- row_idx = 4
672
- events_df = df[df['Event_Type'] != 'Нет'].copy()
673
- for _, row in events_df.iterrows():
674
- ws.cell(row=row_idx, column=5, value=row['Объект'])
675
- ws.cell(row=row_idx, column=6, value=row['Заголовок'])
676
- ws.cell(row=row_idx, column=7, value=row['Event_Type'])
677
- ws.cell(row=row_idx, column=8, value=row['Event_Summary'])
678
- ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])
679
- row_idx += 1
680
-
681
- # 3. Update 'Сводка' sheet
682
- ws = wb['Сводка']
683
- unique_entities = df['Объект'].unique()
684
- entity_stats = []
685
- for entity in unique_entities:
686
- entity_df = df[df['Объект'] == entity]
687
- stats = {
688
- 'Объект': entity,
689
- 'Всего': len(entity_df),
690
- 'Негативные': len(entity_df[entity_df['Sentiment'] == 'Negative']),
691
- 'Позитивные': len(entity_df[entity_df['Sentiment'] == 'Positive'])
692
- }
693
-
694
- # Get most severe impact for entity
695
- negative_df = entity_df[entity_df['Sentiment'] == 'Negative']
696
- if len(negative_df) > 0:
697
- impacts = negative_df['Impact'].dropna()
698
- if len(impacts) > 0:
699
- stats['Impact'] = impacts.iloc[0]
700
- else:
701
- stats['Impact'] = 'Неопределенный эффект'
702
- else:
703
- stats['Impact'] = 'Неопределенный эффект'
704
-
705
- entity_stats.append(stats)
706
-
707
- # Sort by number of negative mentions
708
- entity_stats = sorted(entity_stats, key=lambda x: x['Негативные'], reverse=True)
709
-
710
- # Write to sheet
711
- row_idx = 4 # Starting row in Сводка sheet
712
- for stats in entity_stats:
713
- ws.cell(row=row_idx, column=5, value=stats['Объект'])
714
- ws.cell(row=row_idx, column=6, value=stats['Всего'])
715
- ws.cell(row=row_idx, column=7, value=stats['Негативные'])
716
- ws.cell(row=row_idx, column=8, value=stats['Позитивные'])
717
- ws.cell(row=row_idx, column=9, value=stats['Impact'])
718
- row_idx += 1
719
-
720
- # 4. Update 'Значимые' sheet
721
- ws = wb['Значимые']
722
- row_idx = 3
723
- sentiment_df = df[df['Sentiment'].isin(['Negative', 'Positive'])].copy()
724
- for _, row in sentiment_df.iterrows():
725
- ws.cell(row=row_idx, column=3, value=row['Объект'])
726
- ws.cell(row=row_idx, column=4, value='релевантно')
727
- ws.cell(row=row_idx, column=5, value=row['Sentiment'])
728
- ws.cell(row=row_idx, column=6, value=row.get('Impact', '-'))
729
- ws.cell(row=row_idx, column=7, value=row['Заголовок'])
730
- ws.cell(row=row_idx, column=8, value=row['Выдержки из текста'])
731
- row_idx += 1
732
-
733
- # 5. Update 'Анализ' sheet
734
- ws = wb['Анализ']
735
- row_idx = 4
736
- negative_df = df[df['Sentiment'] == 'Negative'].copy()
737
- for _, row in negative_df.iterrows():
738
- ws.cell(row=row_idx, column=5, value=row['Объект'])
739
- ws.cell(row=row_idx, column=6, value=row['Заголовок'])
740
- ws.cell(row=row_idx, column=7, value="Риск убытка")
741
- ws.cell(row=row_idx, column=8, value=row.get('Reasoning', '-'))
742
- ws.cell(row=row_idx, column=9, value=row['Выдержки из текста'])
743
- row_idx += 1
744
-
745
- # 6. Update 'Тех.приложение' sheet
746
- if 'Тех.приложение' not in wb.sheetnames:
747
- wb.create_sheet('Тех.приложение')
748
- ws = wb['Тех.приложение']
749
-
750
- tech_cols = ['Объект', 'Заголовок', 'Выдержки из текста', 'Translated', 'Sentiment', 'Impact', 'Reasoning']
751
- tech_df = df[tech_cols].copy()
752
-
753
- for r_idx, row in enumerate(dataframe_to_rows(tech_df, index=False, header=True), start=1):
754
- for c_idx, value in enumerate(row, start=1):
755
- ws.cell(row=r_idx, column=c_idx, value=value)
756
-
757
- # Save workbook
758
- output = io.BytesIO()
759
- wb.save(output)
760
- output.seek(0)
761
- return output
762
-
763
- except Exception as e:
764
- logger.error(f"Error creating output file: {str(e)}")
765
- logger.error(f"DataFrame shape: {df.shape}")
766
- logger.error(f"Available columns: {df.columns.tolist()}")
767
- return None
768
-
769
 
770
  def create_interface():
 
771
  control = ProcessControl()
772
 
773
- with gr.Blocks(theme=gr.themes.Soft()) as app:
774
- # Create state for file data
775
- current_file = gr.State(None)
776
-
777
- gr.Markdown("# AI-анализ мониторинга новостей v.2.0 + добавка")
778
 
779
  with gr.Row():
780
  file_input = gr.File(
781
  label="Загрузите Excel файл",
782
- file_types=[".xlsx"],
783
- type="binary"
784
  )
785
 
786
  with gr.Row():
787
- with gr.Column(scale=1):
788
- analyze_btn = gr.Button(
789
- "▶️ Начать анализ",
790
- variant="primary",
791
- size="lg"
792
- )
793
- with gr.Column(scale=1):
794
- stop_btn = gr.Button(
795
- "⏹️ Остановить",
796
- variant="stop",
797
- size="lg"
798
- )
799
 
800
- with gr.Row():
801
- status_box = gr.Textbox(
802
- label="Статус дедупликации",
803
- interactive=False,
804
- value=""
805
- )
806
-
807
- with gr.Row():
808
- progress = gr.Textbox(
809
- label="Статус обработки",
810
- interactive=False,
811
- value="Ожидание файла..."
812
- )
813
 
814
  with gr.Row():
815
- stats = gr.DataFrame(
816
- label="Результаты анализа",
817
- interactive=False,
818
- wrap=True
819
- )
820
-
821
- with gr.Row():
822
- with gr.Column(scale=1):
823
- sentiment_plot = gr.Plot(label="Распределение тональности")
824
- with gr.Column(scale=1):
825
- events_plot = gr.Plot(label="Распределение событий")
826
-
827
- # Create a download row with file component only
828
- with gr.Row():
829
- file_output = gr.File(
830
- label="Скачать результаты",
831
- visible=True,
832
- interactive=True
833
- )
834
-
835
  def stop_processing():
836
  control.request_stop()
837
  return "Остановка обработки..."
838
-
839
 
840
- @spaces.GPU(duration=300)
841
- def process_and_download(file_bytes):
842
- if file_bytes is None:
843
- gr.Warning("Пожалуйста, загрузите файл")
844
- return (pd.DataFrame(), None, None, None, "Ожидание файла...", "")
845
-
846
- detector = None
847
- gpu_manager = GPUTaskManager(
848
- max_retries=3,
849
- retry_delay=30,
850
- cleanup_callback=lambda: detector.cleanup() if detector else None
851
- )
852
-
853
  try:
854
- file_obj = io.BytesIO(file_bytes)
855
- logger.info("File loaded into BytesIO successfully")
856
-
857
- detector = EventDetector()
858
-
859
- # Read and deduplicate data with retry
860
- async def read_and_dedupe():
861
- df = pd.read_excel(file_obj, sheet_name='Публикации')
862
- original_count = len(df)
863
- df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
864
- return df, original_count
865
 
866
- df, original_count = await gpu_manager.run_with_retry(read_and_dedupe)
867
-
868
- # Process in smaller batches with better error handling
869
  processed_rows = []
870
- batches = gpu_manager.batch_process(list(df.iterrows()), batch_size=3)
871
 
872
- for batch in batches:
 
873
  if control.should_stop():
874
  break
875
 
876
- try:
877
- # Process batch with retry mechanism
878
- async def process_batch():
879
- batch_results = []
880
- for idx, row in batch:
881
- text = str(row.get('Выдержки из текста', '')).strip()
882
- entity = str(row.get('Объект', '')).strip()
883
-
884
- if text and entity:
885
- results = detector.process_text(text, entity)
886
- batch_results.append({
887
- 'Объект': entity,
888
- 'Заголовок': str(row.get('Заголовок', '')),
889
- 'Translated': results['translated_text'],
890
- 'Sentiment': results['sentiment'],
891
- 'Impact': results['impact'],
892
- 'Reasoning': results['reasoning'],
893
- 'Event_Type': results['event_type'],
894
- 'Event_Summary': results['event_summary'],
895
- 'Выдержки из текста': text
896
- })
897
- return batch_results
898
-
899
- batch_results = await gpu_manager.run_with_retry(process_batch)
900
- processed_rows.extend(batch_results)
901
-
902
- # Create intermediate results
903
- if processed_rows:
904
- result_df = pd.DataFrame(processed_rows)
905
- yield (
906
- result_df,
907
- None, None, None,
908
- f"Обработано {len(processed_rows)}/{len(df)} строк",
909
- f"Удалено {original_count - len(df)} дубликатов"
910
- )
911
-
912
- except Exception as e:
913
- if gpu_manager.is_gpu_error(e):
914
- logger.warning(f"GPU error in batch processing: {str(e)}")
915
- continue
916
- else:
917
- logger.error(f"Non-GPU error in batch processing: {str(e)}")
918
-
919
- finally:
920
- torch.cuda.empty_cache()
921
-
922
- # Create final results
923
- if processed_rows:
924
- result_df = pd.DataFrame(processed_rows)
925
- output_bytes_io = create_output_file(result_df, file_obj)
926
- fig_sentiment, fig_events = create_visualizations(result_df)
927
 
928
- if output_bytes_io:
929
- temp_file = "results.xlsx"
930
- with open(temp_file, "wb") as f:
931
- f.write(output_bytes_io.getvalue())
932
- return (
933
- result_df,
934
- fig_sentiment,
935
- fig_events,
936
- temp_file,
937
- "Обработка завершена!",
938
- f"Удалено {original_count - len(df)} дубликатов"
939
- )
940
-
941
- return (pd.DataFrame(), None, None, None, "Нет обработанных данных", "")
 
 
 
 
942
 
943
  except Exception as e:
944
  error_msg = f"Ошибка анализа: {str(e)}"
945
  logger.error(error_msg)
946
- return (pd.DataFrame(), None, None, None, error_msg, "")
947
 
948
  finally:
949
- if detector:
950
  detector.cleanup()
951
 
952
  stop_btn.click(fn=stop_processing, outputs=[progress])
953
 
954
- # Main processing - simplified outputs
955
  analyze_btn.click(
956
- fn=process_and_download,
957
  inputs=[file_input],
958
- outputs=[
959
- stats,
960
- sentiment_plot,
961
- events_plot,
962
- file_output,
963
- progress,
964
- status_box
965
- ]
966
  )
967
-
968
  return app
969
 
970
  if __name__ == "__main__":
971
  app = create_interface()
972
- app.launch(share=True)
 
1
  import gradio as gr
 
2
  import pandas as pd
3
  import torch
4
  from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel
 
8
  from rapidfuzz import fuzz
9
  import time
10
  import os
 
 
 
 
 
 
 
 
11
  from typing import List, Set, Tuple
12
+ import asyncio
13
 
14
+ # Configure logging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
 
18
  class GPUTaskManager:
19
  def __init__(self, max_retries=3, retry_delay=30, cleanup_callback=None):
20
  self.max_retries = max_retries
 
22
  self.cleanup_callback = cleanup_callback
23
 
24
  async def run_with_retry(self, task_func, *args, **kwargs):
 
25
  for attempt in range(self.max_retries):
26
  try:
27
  return await task_func(*args, **kwargs)
28
  except Exception as e:
29
+ if "CUDA out of memory" in str(e) or "GPU quota" in str(e):
30
  if attempt < self.max_retries - 1:
31
  if self.cleanup_callback:
32
  self.cleanup_callback()
 
37
 
38
  @staticmethod
39
  def batch_process(items, batch_size=3):
 
40
  return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  class ProcessControl:
43
  def __init__(self):
 
64
  device = "cuda" if torch.cuda.is_available() else "cpu"
65
  logger.info(f"Initializing models on device: {device}")
66
 
67
+ self.device = device
68
+ self.initialize_models()
69
 
70
  # Initialize transformer for declusterization
71
  self.tokenizer_cluster = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
72
  self.model_cluster = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2').to(device)
73
 
 
74
  self.initialized = True
75
  logger.info("All models initialized successfully")
76
 
 
78
  logger.error(f"Error in EventDetector initialization: {str(e)}")
79
  raise
80
 
81
+ def initialize_models(self):
82
+ """Initialize models with proper error handling"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
+ # Initialize translation models
85
+ self.translator = pipeline(
86
+ "translation",
87
+ model="Helsinki-NLP/opus-mt-ru-en",
88
+ device=self.device
89
+ )
90
 
91
+ self.rutranslator = pipeline(
92
+ "translation",
93
+ model="Helsinki-NLP/opus-mt-en-ru",
94
+ device=self.device
95
+ )
96
+
97
+ # Initialize sentiment models
98
+ self.finbert = pipeline(
99
+ "sentiment-analysis",
100
+ model="ProsusAI/finbert",
101
+ device=self.device,
102
+ truncation=True,
103
+ max_length=512
104
+ )
105
 
106
+ self.roberta = pipeline(
107
+ "sentiment-analysis",
108
+ model="cardiffnlp/twitter-roberta-base-sentiment",
109
+ device=self.device,
110
+ truncation=True,
111
+ max_length=512
112
+ )
113
+
114
+ # Initialize MT5 model
115
+ self.model_name = "google/mt5-small"
116
+ self.tokenizer = AutoTokenizer.from_pretrained(
117
+ self.model_name,
118
+ legacy=True
119
+ )
120
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(self.device)
121
 
122
+ except Exception as e:
123
+ logger.error(f"Model initialization error: {str(e)}")
124
+ raise
125
+
126
+ def process_text(self, text, entity):
127
+ """Process text with simplified analysis"""
128
+ try:
129
+ translated_text = self._translate_text(text)
130
+ sentiment = self.analyze_sentiment(translated_text)
131
+ event_type, event_summary = self.detect_events(text, entity)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
+ return {
134
+ 'translated_text': translated_text,
135
+ 'sentiment': sentiment,
136
+ 'impact': 'Неопределенный эффект',
137
+ 'reasoning': 'Автоматический анализ',
138
+ 'event_type': event_type,
139
+ 'event_summary': event_summary
140
+ }
141
 
142
  except Exception as e:
143
+ logger.error(f"Text processing error: {str(e)}")
144
+ return {
145
+ 'translated_text': '',
146
+ 'sentiment': 'Neutral',
147
+ 'impact': 'Неопределенный эффект',
148
+ 'reasoning': f'Ошибка обработки: {str(e)}',
149
+ 'event_type': 'Нет',
150
+ 'event_summary': ''
151
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
 
153
  def _translate_text(self, text):
154
+ """Translate Russian text to English with proper error handling"""
155
  try:
156
  if not text or not isinstance(text, str):
157
  return ""
 
160
  if not text:
161
  return ""
162
 
 
163
  max_length = 450
164
  chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
165
  translated_chunks = []
 
167
  for chunk in chunks:
168
  result = self.translator(chunk)[0]['translation_text']
169
  translated_chunks.append(result)
170
+ time.sleep(0.1)
171
 
172
  return " ".join(translated_chunks)
173
 
 
175
  logger.error(f"Translation error: {str(e)}")
176
  return text
177
 
 
178
  def analyze_sentiment(self, text):
179
+ """Simplified sentiment analysis"""
180
  try:
181
+ if not text or not isinstance(text, str) or not text.strip():
182
  return "Neutral"
183
 
 
 
 
 
 
184
  finbert_result = self.finbert(text)[0]
185
  roberta_result = self.roberta(text)[0]
 
186
 
187
+ # Simple majority voting
188
+ sentiments = []
189
+ for result in [finbert_result, roberta_result]:
190
  label = result['label'].lower()
191
+ if 'positive' in label or 'pos' in label:
192
+ sentiments.append("Positive")
193
+ elif 'negative' in label or 'neg' in label:
194
+ sentiments.append("Negative")
 
 
 
 
 
 
 
 
195
  else:
196
+ sentiments.append("Neutral")
 
 
 
 
 
 
 
197
 
198
+ # Count occurrences
 
 
 
 
 
199
  pos_count = sentiments.count("Positive")
200
+ neg_count = sentiments.count("Negative")
201
+
202
+ if neg_count > pos_count:
203
+ return "Negative"
204
+ elif pos_count > neg_count:
205
  return "Positive"
 
206
  return "Neutral"
207
 
208
  except Exception as e:
209
  logger.error(f"Sentiment analysis error: {str(e)}")
210
  return "Neutral"
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  def detect_events(self, text, entity):
213
+ """Simplified event detection"""
214
  if not text or not entity:
215
  return "Нет", "Invalid input"
216
 
217
  try:
218
+ prompt = f"<s>Classify news about {entity}: {text}</s>"
219
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  inputs = self.tokenizer(
221
  prompt,
222
  return_tensors="pt",
 
227
 
228
  outputs = self.model.generate(
229
  **inputs,
230
+ max_length=100,
231
  num_return_sequences=1,
232
+ do_sample=False
 
 
 
233
  )
234
 
235
  response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
236
 
237
+ # Simple classification based on keywords
238
+ if any(word in response.lower() for word in ['financial', 'revenue', 'profit']):
239
+ return "Отчетность", "Financial report detected"
240
+ elif any(word in response.lower() for word in ['court', 'lawsuit', 'legal']):
241
+ return "Суд", "Legal proceedings detected"
242
+ elif any(word in response.lower() for word in ['bonds', 'securities', 'debt']):
243
+ return "РЦБ", "Securities-related news detected"
244
+
245
+ return "Нет", "No specific event detected"
 
 
 
 
246
 
247
  except Exception as e:
248
  logger.error(f"Event detection error: {str(e)}")
 
255
  self.translator = None
256
  self.finbert = None
257
  self.roberta = None
 
258
  torch.cuda.empty_cache()
259
  self.initialized = False
260
  logger.info("Cleaned up GPU resources")
 
262
  logger.error(f"Error in cleanup: {str(e)}")
263
 
264
  def create_visualizations(df):
265
+ """Create visualization plots"""
266
  if df is None or df.empty:
267
  return None, None
268
 
 
288
  except Exception as e:
289
  logger.error(f"Visualization error: {e}")
290
  return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
 
292
  def create_interface():
293
+ """Create Gradio interface"""
294
  control = ProcessControl()
295
 
296
+ with gr.Blocks() as app:
297
+ gr.Markdown("# AI-анализ мониторинга новостей v.2.0")
 
 
 
298
 
299
  with gr.Row():
300
  file_input = gr.File(
301
  label="Загрузите Excel файл",
302
+ file_types=[".xlsx"]
 
303
  )
304
 
305
  with gr.Row():
306
+ analyze_btn = gr.Button("▶️ Начать анализ", variant="primary")
307
+ stop_btn = gr.Button("⏹️ Остановить", variant="stop")
 
 
 
 
 
 
 
 
 
 
308
 
309
+ progress = gr.Textbox(
310
+ label="Статус обработки",
311
+ value="Ожидание файла..."
312
+ )
313
+
314
+ stats = gr.DataFrame(label="Результаты анализа")
 
 
 
 
 
 
 
315
 
316
  with gr.Row():
317
+ sentiment_plot = gr.Plot(label="Распределение тональности")
318
+ events_plot = gr.Plot(label="Распределение событий")
319
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  def stop_processing():
321
  control.request_stop()
322
  return "Остановка обработки..."
 
323
 
324
+ def process_file(file):
 
 
 
 
 
 
 
 
 
 
 
 
325
  try:
326
+ if file is None:
327
+ return None, None, None, "Пожалуйста, загрузите файл"
 
 
 
 
 
 
 
 
 
328
 
329
+ df = pd.read_excel(file.name)
330
+ detector = EventDetector()
 
331
  processed_rows = []
 
332
 
333
+ total = len(df)
334
+ for idx, row in df.iterrows():
335
  if control.should_stop():
336
  break
337
 
338
+ text = str(row.get('Выдержки из текста', '')).strip()
339
+ entity = str(row.get('Объект', '')).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
+ if text and entity:
342
+ results = detector.process_text(text, entity)
343
+ processed_rows.append({
344
+ 'Объект': entity,
345
+ 'Заголовок': str(row.get('Заголовок', '')),
346
+ 'Sentiment': results['sentiment'],
347
+ 'Event_Type': results['event_type'],
348
+ 'Event_Summary': results['event_summary'],
349
+ 'Текст': text[:1000]
350
+ })
351
+
352
+ if len(processed_rows) % 10 == 0:
353
+ yield pd.DataFrame(processed_rows), None, None, f"Обработано {len(processed_rows)}/{total} строк"
354
+
355
+ final_df = pd.DataFrame(processed_rows)
356
+ fig_sentiment, fig_events = create_visualizations(final_df)
357
+
358
+ return final_df, fig_sentiment, fig_events, "Обработка завершена!"
359
 
360
  except Exception as e:
361
  error_msg = f"Ошибка анализа: {str(e)}"
362
  logger.error(error_msg)
363
+ return None, None, None, error_msg
364
 
365
  finally:
366
+ if 'detector' in locals():
367
  detector.cleanup()
368
 
369
  stop_btn.click(fn=stop_processing, outputs=[progress])
370
 
 
371
  analyze_btn.click(
372
+ fn=process_file,
373
  inputs=[file_input],
374
+ outputs=[stats, sentiment_plot, events_plot, progress]
 
 
 
 
 
 
 
375
  )
376
+
377
  return app
378
 
379
  if __name__ == "__main__":
380
  app = create_interface()
381
+ app.launch()