Spaces:
Running
Running
Commit
·
cdaad51
1
Parent(s):
59bee7f
3.57
Browse files
app.py
CHANGED
@@ -73,7 +73,7 @@ class FallbackLLMSystem:
|
|
73 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
74 |
self.model = self.model.to(self.device)
|
75 |
|
76 |
-
st.success(f"
|
77 |
|
78 |
except Exception as e:
|
79 |
st.error(f"Error initializing MT5: {str(e)}")
|
@@ -230,10 +230,10 @@ class QwenSystem:
|
|
230 |
)
|
231 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
232 |
|
233 |
-
st.success(f"
|
234 |
|
235 |
except Exception as e:
|
236 |
-
st.error(f"
|
237 |
raise
|
238 |
|
239 |
def invoke(self, messages):
|
@@ -347,9 +347,9 @@ class EventDetectionSystem:
|
|
347 |
model="yiyanghkust/finbert-tone",
|
348 |
return_all_scores=True
|
349 |
)
|
350 |
-
st.success("BERT
|
351 |
except Exception as e:
|
352 |
-
st.error(f"
|
353 |
raise
|
354 |
|
355 |
def detect_event_type(self, text, entity):
|
@@ -404,70 +404,106 @@ class EventDetectionSystem:
|
|
404 |
|
405 |
class TranslationSystem:
|
406 |
def __init__(self):
|
407 |
-
"""Initialize translation system using Helsinki NLP model"""
|
408 |
try:
|
409 |
self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
|
410 |
-
|
|
|
|
|
|
|
411 |
except Exception as e:
|
412 |
-
st.error(f"
|
413 |
raise
|
414 |
-
|
415 |
-
def
|
416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
417 |
return str(text) if pd.notna(text) else ""
|
418 |
-
|
419 |
text = str(text).strip()
|
420 |
if not text:
|
421 |
return ""
|
422 |
-
|
423 |
try:
|
424 |
-
|
425 |
-
chunks = self._split_into_chunks(text
|
426 |
translated_chunks = []
|
427 |
-
|
|
|
428 |
for chunk in chunks:
|
429 |
if not chunk.strip():
|
430 |
continue
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
except Exception as e:
|
446 |
st.warning(f"Translation error: {str(e)}")
|
447 |
-
return text
|
448 |
-
|
449 |
-
def _split_into_chunks(self, text, max_length):
|
450 |
-
sentences = []
|
451 |
-
for s in text.replace('!', '.').replace('?', '.').split('.'):
|
452 |
-
s = s.strip()
|
453 |
-
if s:
|
454 |
-
if len(s) > max_length:
|
455 |
-
# Split long sentences into smaller chunks
|
456 |
-
words = s.split()
|
457 |
-
current_chunk = []
|
458 |
-
current_length = 0
|
459 |
-
for word in words:
|
460 |
-
if current_length + len(word) > max_length:
|
461 |
-
sentences.append(' '.join(current_chunk))
|
462 |
-
current_chunk = [word]
|
463 |
-
current_length = len(word)
|
464 |
-
else:
|
465 |
-
current_chunk.append(word)
|
466 |
-
current_length += len(word) + 1
|
467 |
-
if current_chunk:
|
468 |
-
sentences.append(' '.join(current_chunk))
|
469 |
-
else:
|
470 |
-
sentences.append(s)
|
471 |
|
472 |
|
473 |
|
@@ -962,7 +998,7 @@ def main():
|
|
962 |
st.set_page_config(layout="wide")
|
963 |
|
964 |
with st.sidebar:
|
965 |
-
st.title("::: AI-анализ мониторинга новостей (v.3.
|
966 |
st.subheader("по материалам СКАН-ИНТЕРФАКС")
|
967 |
|
968 |
model_choice = st.radio(
|
|
|
73 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
74 |
self.model = self.model.to(self.device)
|
75 |
|
76 |
+
st.success(f"Запустил MT5-модель на {self.device}")
|
77 |
|
78 |
except Exception as e:
|
79 |
st.error(f"Error initializing MT5: {str(e)}")
|
|
|
230 |
)
|
231 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
232 |
|
233 |
+
st.success(f"запустил Qwen2.5 model")
|
234 |
|
235 |
except Exception as e:
|
236 |
+
st.error(f"ошибка запуска Qwen2.5: {str(e)}")
|
237 |
raise
|
238 |
|
239 |
def invoke(self, messages):
|
|
|
347 |
model="yiyanghkust/finbert-tone",
|
348 |
return_all_scores=True
|
349 |
)
|
350 |
+
st.success("BERT-модели запущены для детекции новостей")
|
351 |
except Exception as e:
|
352 |
+
st.error(f"Ошибка запуска BERT: {str(e)}")
|
353 |
raise
|
354 |
|
355 |
def detect_event_type(self, text, entity):
|
|
|
404 |
|
405 |
class TranslationSystem:
|
406 |
def __init__(self):
    """Initialize translation system using Helsinki NLP model with fallback options."""
    try:
        # Primary backend: local Helsinki-NLP ru->en model via a transformers pipeline.
        self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
        # Initialize fallback translator
        # NOTE(review): GoogleTranslator is presumably deep_translator's and
        # LegacyTranslator an aliased googletrans client — confirm the imports
        # at the top of the file.
        self.fallback_translator = GoogleTranslator(source='ru', target='en')
        self.legacy_translator = LegacyTranslator()
        st.success("Запустил систему перевода")
    except Exception as e:
        # Surface the failure in the Streamlit UI, then re-raise so the caller
        # does not proceed with a half-initialized translation system.
        st.error(f"Ошибка запуска перевода: {str(e)}")
        raise
|
417 |
+
|
418 |
+
def _split_into_chunks(self, text: str, max_length: int = 450) -> list:
|
419 |
+
"""Split text into chunks while preserving word boundaries"""
|
420 |
+
words = text.split()
|
421 |
+
chunks = []
|
422 |
+
current_chunk = []
|
423 |
+
current_length = 0
|
424 |
+
|
425 |
+
for word in words:
|
426 |
+
word_length = len(word)
|
427 |
+
if current_length + word_length + 1 <= max_length:
|
428 |
+
current_chunk.append(word)
|
429 |
+
current_length += word_length + 1
|
430 |
+
else:
|
431 |
+
if current_chunk:
|
432 |
+
chunks.append(' '.join(current_chunk))
|
433 |
+
current_chunk = [word]
|
434 |
+
current_length = word_length
|
435 |
+
|
436 |
+
if current_chunk:
|
437 |
+
chunks.append(' '.join(current_chunk))
|
438 |
+
|
439 |
+
return chunks
|
440 |
+
|
441 |
+
def _translate_chunk_with_retries(self, chunk: str, max_retries: int = 3) -> str:
|
442 |
+
"""Attempt translation with multiple fallback options"""
|
443 |
+
if not chunk or not chunk.strip():
|
444 |
+
return ""
|
445 |
+
|
446 |
+
for attempt in range(max_retries):
|
447 |
+
try:
|
448 |
+
# First try Helsinki NLP
|
449 |
+
result = self.translator(chunk, max_length=512)
|
450 |
+
if result and isinstance(result, list) and len(result) > 0:
|
451 |
+
translated = result[0].get('translation_text')
|
452 |
+
if translated and isinstance(translated, str):
|
453 |
+
return translated
|
454 |
+
|
455 |
+
# First fallback: Google Translator
|
456 |
+
translated = self.fallback_translator.translate(chunk)
|
457 |
+
if translated and isinstance(translated, str):
|
458 |
+
return translated
|
459 |
+
|
460 |
+
# Second fallback: Legacy Google Translator
|
461 |
+
translated = self.legacy_translator.translate(chunk, src='ru', dest='en').text
|
462 |
+
if translated and isinstance(translated, str):
|
463 |
+
return translated
|
464 |
+
|
465 |
+
except Exception as e:
|
466 |
+
if attempt == max_retries - 1:
|
467 |
+
st.warning(f"Попробовал перевести {max_retries} раз, не преуспел: {str(e)}")
|
468 |
+
time.sleep(1 * (attempt + 1)) # Exponential backoff
|
469 |
+
|
470 |
+
return chunk # Return original text if all translation attempts fail
|
471 |
+
|
472 |
+
def translate_text(self, text: str) -> str:
    """Translate text to English with robust validation and graceful fallback.

    NaN input becomes "", other non-string input is stringified, blank input
    yields "", and any unexpected failure returns the original text so the
    caller never loses data.
    """
    # Input validation: NaN -> "", any other non-string -> str().
    if pd.isna(text) or not isinstance(text, str):
        return str(text) if pd.notna(text) else ""

    text = str(text).strip()
    if not text:
        return ""

    try:
        pieces = []
        # Translate chunk-by-chunk so each request stays within model limits.
        for piece in self._split_into_chunks(text):
            if not piece.strip():
                continue
            translated = self._translate_chunk_with_retries(piece)
            if translated:  # keep only non-empty translations
                pieces.append(translated)
            time.sleep(0.1)  # Rate limiting

        if not pieces:
            return text  # nothing translated successfully: keep the original

        joined = ' '.join(pieces)
        return joined if joined.strip() else text
    except Exception as e:
        st.warning(f"Translation error: {str(e)}")
        return text  # fail-safe: original text on any error
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
507 |
|
508 |
|
509 |
|
|
|
998 |
st.set_page_config(layout="wide")
|
999 |
|
1000 |
with st.sidebar:
|
1001 |
+
st.title("::: AI-анализ мониторинга новостей (v.3.57):::")
|
1002 |
st.subheader("по материалам СКАН-ИНТЕРФАКС")
|
1003 |
|
1004 |
model_choice = st.radio(
|