pentarosarium committed on
Commit
cdaad51
·
1 Parent(s): 59bee7f
Files changed (1) hide show
  1. app.py +91 -55
app.py CHANGED
@@ -73,7 +73,7 @@ class FallbackLLMSystem:
73
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
74
  self.model = self.model.to(self.device)
75
 
76
- st.success(f"Successfully initialized MT5 model on {self.device}")
77
 
78
  except Exception as e:
79
  st.error(f"Error initializing MT5: {str(e)}")
@@ -230,10 +230,10 @@ class QwenSystem:
230
  )
231
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
232
 
233
- st.success(f"Successfully initialized Qwen2.5 model")
234
 
235
  except Exception as e:
236
- st.error(f"Error initializing Qwen2.5: {str(e)}")
237
  raise
238
 
239
  def invoke(self, messages):
@@ -347,9 +347,9 @@ class EventDetectionSystem:
347
  model="yiyanghkust/finbert-tone",
348
  return_all_scores=True
349
  )
350
- st.success("BERT models initialized for event detection")
351
  except Exception as e:
352
- st.error(f"Error initializing BERT models: {str(e)}")
353
  raise
354
 
355
  def detect_event_type(self, text, entity):
@@ -404,70 +404,106 @@ class EventDetectionSystem:
404
 
405
  class TranslationSystem:
406
  def __init__(self):
407
- """Initialize translation system using Helsinki NLP model"""
408
  try:
409
  self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
410
- st.success("Translation system initialized")
 
 
 
411
  except Exception as e:
412
- st.error(f"Error initializing translator: {str(e)}")
413
  raise
414
-
415
- def translate_text(self, text):
416
- if pd.isna(text) or not isinstance(text, str) or not text.strip():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  return str(text) if pd.notna(text) else ""
418
-
419
  text = str(text).strip()
420
  if not text:
421
  return ""
422
-
423
  try:
424
- max_chunk_size = 450
425
- chunks = self._split_into_chunks(text, max_chunk_size)
426
  translated_chunks = []
427
-
 
428
  for chunk in chunks:
429
  if not chunk.strip():
430
  continue
431
-
432
- try:
433
- result = self.translator(chunk, max_length=512)
434
- if result and isinstance(result, list) and len(result) > 0:
435
- translated_chunks.append(result[0].get('translation_text', chunk))
436
- else:
437
- translated_chunks.append(chunk)
438
- except Exception as e:
439
- st.warning(f"Chunk translation error: {str(e)}")
440
- translated_chunks.append(chunk)
441
- time.sleep(0.1)
442
-
443
- return ' '.join(translated_chunks)
444
-
445
  except Exception as e:
446
  st.warning(f"Translation error: {str(e)}")
447
- return text
448
-
449
- def _split_into_chunks(self, text, max_length):
450
- sentences = []
451
- for s in text.replace('!', '.').replace('?', '.').split('.'):
452
- s = s.strip()
453
- if s:
454
- if len(s) > max_length:
455
- # Split long sentences into smaller chunks
456
- words = s.split()
457
- current_chunk = []
458
- current_length = 0
459
- for word in words:
460
- if current_length + len(word) > max_length:
461
- sentences.append(' '.join(current_chunk))
462
- current_chunk = [word]
463
- current_length = len(word)
464
- else:
465
- current_chunk.append(word)
466
- current_length += len(word) + 1
467
- if current_chunk:
468
- sentences.append(' '.join(current_chunk))
469
- else:
470
- sentences.append(s)
471
 
472
 
473
 
@@ -962,7 +998,7 @@ def main():
962
  st.set_page_config(layout="wide")
963
 
964
  with st.sidebar:
965
- st.title("::: AI-анализ мониторинга новостей (v.3.56):::")
966
  st.subheader("по материалам СКАН-ИНТЕРФАКС")
967
 
968
  model_choice = st.radio(
 
73
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
74
  self.model = self.model.to(self.device)
75
 
76
+ st.success(f"Запустил MT5-модель на {self.device}")
77
 
78
  except Exception as e:
79
  st.error(f"Error initializing MT5: {str(e)}")
 
230
  )
231
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
232
 
233
+ st.success(f"запустил Qwen2.5 model")
234
 
235
  except Exception as e:
236
+ st.error(f"ошибка запуска Qwen2.5: {str(e)}")
237
  raise
238
 
239
  def invoke(self, messages):
 
347
  model="yiyanghkust/finbert-tone",
348
  return_all_scores=True
349
  )
350
+ st.success("BERT-модели запущены для детекции новостей")
351
  except Exception as e:
352
+ st.error(f"Ошибка запуска BERT: {str(e)}")
353
  raise
354
 
355
  def detect_event_type(self, text, entity):
 
404
 
405
class TranslationSystem:
    """Russian-to-English translation with a local model and two remote fallbacks.

    Primary: Helsinki-NLP opus-mt-ru-en via the transformers ``pipeline``.
    Fallbacks (tried in order when the local model returns nothing or raises):
    ``GoogleTranslator`` (deep-translator style) and a legacy Google client.
    Long inputs are split into word-boundary chunks before translation.
    """

    def __init__(self):
        """Initialize translation system using Helsinki NLP model with fallback options."""
        try:
            self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
            # Initialize fallback translators (remote services used when the
            # local model yields no usable result).
            self.fallback_translator = GoogleTranslator(source='ru', target='en')
            self.legacy_translator = LegacyTranslator()
            st.success("Запустил систему перевода")
        except Exception as e:
            st.error(f"Ошибка запуска перевода: {str(e)}")
            raise

    def _split_into_chunks(self, text: str, max_length: int = 450) -> list:
        """Split text into chunks of at most ~max_length chars, preserving word boundaries.

        A single word longer than ``max_length`` becomes its own (oversized)
        chunk rather than being truncated or dropped.
        """
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            word_length = len(word)
            # +1 accounts for the joining space between words.
            if current_length + word_length + 1 <= max_length:
                current_chunk.append(word)
                current_length += word_length + 1
            else:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _translate_chunk_with_retries(self, chunk: str, max_retries: int = 3) -> str:
        """Attempt translation with multiple fallback options.

        Tries the local Helsinki model first, then the Google fallback, then
        the legacy client. Returns the original chunk unchanged if every
        attempt fails.
        """
        if not chunk or not chunk.strip():
            return ""

        for attempt in range(max_retries):
            try:
                # First try Helsinki NLP (local model).
                result = self.translator(chunk, max_length=512)
                if result and isinstance(result, list) and len(result) > 0:
                    translated = result[0].get('translation_text')
                    if translated and isinstance(translated, str):
                        return translated

                # First fallback: Google Translator.
                translated = self.fallback_translator.translate(chunk)
                if translated and isinstance(translated, str):
                    return translated

                # Second fallback: legacy Google Translator client.
                translated = self.legacy_translator.translate(chunk, src='ru', dest='en').text
                if translated and isinstance(translated, str):
                    return translated

            except Exception as e:
                if attempt == max_retries - 1:
                    st.warning(f"Попробовал перевести {max_retries} раз, не преуспел: {str(e)}")
                else:
                    # Linear backoff (1s, 2s, ...) before the next attempt.
                    # Fix: the original version also slept after the FINAL
                    # failed attempt, pointlessly delaying the fallback return
                    # (and its comment mislabeled this as "exponential").
                    time.sleep(1 * (attempt + 1))

        return chunk  # Return original text if all translation attempts fail

    def translate_text(self, text: str) -> str:
        """Translate text with robust error handling and validation.

        NaN / non-string inputs are returned stringified (or "" for NaN);
        on any failure the original text is returned rather than raising.
        """
        # Input validation
        if pd.isna(text) or not isinstance(text, str):
            return str(text) if pd.notna(text) else ""

        text = str(text).strip()
        if not text:
            return ""

        try:
            # Split into manageable chunks for the 512-token model limit.
            chunks = self._split_into_chunks(text)
            translated_chunks = []

            # Process each chunk with validation.
            for chunk in chunks:
                if not chunk.strip():
                    continue

                translated_chunk = self._translate_chunk_with_retries(chunk)
                if translated_chunk:  # Only add non-empty translations
                    translated_chunks.append(translated_chunk)
                time.sleep(0.1)  # Rate limiting for the remote fallbacks

            # Final validation of results.
            if not translated_chunks:
                return text  # Return original if no translations succeeded

            result = ' '.join(translated_chunks)
            return result if result.strip() else text

        except Exception as e:
            st.warning(f"Translation error: {str(e)}")
            return text  # Return original text on error
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
 
508
 
509
 
 
998
  st.set_page_config(layout="wide")
999
 
1000
  with st.sidebar:
1001
+ st.title("::: AI-анализ мониторинга новостей (v.3.57):::")
1002
  st.subheader("по материалам СКАН-ИНТЕРФАКС")
1003
 
1004
  model_choice = st.radio(