pentarosarium committed on
Commit
913a17b
·
1 Parent(s): 0cc6350

3.38: hard-coded translation (deep-translator only)

Browse files
Files changed (1) hide show
  1. app.py +71 -193
app.py CHANGED
@@ -22,177 +22,54 @@ from typing import Optional
22
  from deep_translator import GoogleTranslator as DeepGoogleTranslator
23
  from googletrans import Translator as LegacyTranslator
24
 
 
25
class TranslationSystem:
    def __init__(self, method='auto', llm=None, batch_size=10):
        """
        Initialize translation system with multiple fallback options.

        Args:
            method: str - Translation method to use ('auto', 'deep-google', or 'llm')
            llm: Optional LangChain LLM instance
            batch_size: int - Number of texts to process in each batch
        """
        self.method = method
        self.llm = llm
        self.batch_size = batch_size
        self.translator = None
        self._initialize_translator()

    def _initialize_translator(self):
        """Pick a working backend: deep-translator first, LLM as fallback."""
        if self.method == 'llm':
            if not self.llm:
                raise Exception("LLM must be provided when using 'llm' method")
            return

        try:
            # Try deep-translator first
            self.translator = DeepGoogleTranslator()
            self.method = 'deep-google'
            # Smoke-test the backend so failures surface at init time
            test_result = self.translator.translate(text='test', source='ru', target='en')
            if not test_result:
                raise Exception("Deep translator test failed")

        except Exception as deep_e:
            st.warning(f"Deep-translator initialization failed: {str(deep_e)}")

            if self.llm:
                st.info("Falling back to LLM translation")
                self.method = 'llm'
            else:
                raise Exception("No translation method available")

    def translate_batch(self, texts, src='ru', dest='en'):
        """
        Translate a batch of texts with fallback options.

        Non-string items are stringified as-is; a failed translation falls
        back to the LLM (if available) and, failing that, the original text.
        """
        translations = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            batch_translations = []

            for text in batch:
                try:
                    if not isinstance(text, str):
                        batch_translations.append(str(text))
                        continue

                    translation = self._translate_single_text(text, src, dest)
                    batch_translations.append(translation)

                except Exception as e:
                    st.warning(f"Translation error: {str(e)}. Using original text.")
                    batch_translations.append(text)

                    # Try LLM fallback if available
                    if self.method != 'llm' and self.llm:
                        try:
                            st.info("Attempting LLM translation fallback...")
                            temp_method = self.method
                            self.method = 'llm'
                            # BUGFIX: restore the primary method even when the
                            # LLM fallback raises; previously self.method stayed
                            # stuck on 'llm' after a failed fallback.
                            try:
                                batch_translations[-1] = self._translate_single_text(text, src, dest)
                            finally:
                                self.method = temp_method
                        except Exception as llm_e:
                            st.warning(f"LLM fallback failed: {str(llm_e)}")

            translations.extend(batch_translations)
            time.sleep(1)  # throttle between batches to avoid rate limits

        return translations

    def _translate_single_text(self, text, src='ru', dest='en'):
        """
        Translate a single text with the currently selected method.

        NaN / non-string / blank inputs are returned unchanged.
        """
        if pd.isna(text) or not isinstance(text, str) or not text.strip():
            return text

        text = text.strip()

        if self.method == 'llm':
            return self._translate_with_llm(text, src, dest)
        elif self.method == 'deep-google':
            return self._translate_with_deep_google(text, src, dest)
        else:
            raise Exception(f"Unsupported translation method: {self.method}")

    def _translate_with_deep_google(self, text, src='ru', dest='en'):
        """
        Translate using deep-translator's Google Translate, chunking long texts.
        """
        try:
            # deep-translator uses different language codes
            src = 'auto' if src == 'auto' else src.lower()
            dest = dest.lower()

            # Split long texts (deep-translator has a character limit)
            max_length = 5000
            if len(text) > max_length:
                chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
                translated_chunks = [
                    self.translator.translate(text=chunk, source=src, target=dest)
                    for chunk in chunks
                ]
                return ' '.join(translated_chunks)
            else:
                return self.translator.translate(text=text, source=src, target=dest)

        except Exception as e:
            raise Exception(f"Deep-translator error: {str(e)}")

    def _translate_with_llm(self, text, src='ru', dest='en'):
        """
        Translate using a LangChain LLM.
        """
        if not self.llm:
            raise Exception("LLM not initialized for translation")

        messages = [
            {"role": "system", "content": "You are a translator. Translate the given text accurately and concisely."},
            {"role": "user", "content": f"Translate this text from {src} to {dest}: {text}"}
        ]

        response = self.llm.invoke(messages)
        return response.content.strip() if hasattr(response, 'content') else str(response).strip()
def init_translation_system(model_choice, translation_method='auto'):
    """
    Build a configured TranslationSystem for the given model choice.

    An LLM is only instantiated when the chosen method may need it;
    pure 'deep-google' translation runs without one.
    """
    if translation_method != 'deep-google':
        llm = init_langchain_llm(model_choice)
    else:
        llm = None

    try:
        return TranslationSystem(
            method=translation_method,
            llm=llm,
            batch_size=5
        )
    except Exception as e:
        st.error(f"Failed to initialize translation system: {str(e)}")
        raise
183
 
184
- def process_file(uploaded_file, model_choice, translation_method='auto'):
185
  df = None
186
  try:
187
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
188
  llm = init_langchain_llm(model_choice)
189
-
190
- # Initialize translation system
191
- translator = TranslationSystem(
192
- method=translation_method, # Remove quotes from parameter name
193
- llm=llm,
194
- batch_size=5
195
- )
196
 
197
  # Validate required columns
198
  required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
@@ -228,52 +105,53 @@ def process_file(uploaded_file, model_choice, translation_method='auto'):
228
  for i in range(0, len(df), batch_size):
229
  batch_df = df.iloc[i:i+batch_size]
230
 
231
- try:
232
- # Translate batch
233
- texts_to_translate = batch_df['Выдержки из текста'].tolist()
234
- translations = translator.translate_batch(texts_to_translate)
235
- df.loc[df.index[i:i+batch_size], 'Translated'] = translations
236
-
237
- # Process each item in batch
238
- for j, (idx, row) in enumerate(batch_df.iterrows()):
239
- try:
240
- # Analyze sentiment with rate limit handling
241
- sentiment = analyze_sentiment(translations[j])
242
- df.at[idx, 'Sentiment'] = sentiment
243
-
244
- # Detect events with rate limit handling
245
- event_type, event_summary = detect_events(
 
 
 
 
 
 
246
  llm,
247
- row['Выдержки из текста'],
248
  row['Объект']
249
  )
250
- df.at[idx, 'Event_Type'] = event_type
251
- df.at[idx, 'Event_Summary'] = event_summary
252
-
253
- if sentiment == "Negative":
254
- impact, reasoning = estimate_impact(
255
- llm,
256
- translations[j],
257
- row['Объект']
258
- )
259
- df.at[idx, 'Impact'] = impact
260
- df.at[idx, 'Reasoning'] = reasoning
261
-
262
- # Update progress
263
- progress = (i + j + 1) / len(df)
264
- progress_bar.progress(progress)
265
- status_text.text(f"Проанализировано {i + j + 1} из {len(df)} новостей")
266
-
267
- except Exception as e:
268
- st.warning(f"Ошибка при обработке новости {idx + 1}: {str(e)}")
269
- continue
270
 
271
- # Add delay between batches to avoid rate limits
272
- time.sleep(2)
 
 
 
 
 
 
273
 
274
- except Exception as e:
275
- st.warning(f"Ошибка при обработке батча {i//batch_size + 1}: {str(e)}")
276
- continue
 
 
277
 
278
  return df
279
 
@@ -677,7 +555,7 @@ def create_output_file(df, uploaded_file, llm):
677
 
678
  def main():
679
  with st.sidebar:
680
- st.title("::: AI-анализ мониторинга новостей (v.3.37 ):::")
681
  st.subheader("по материалам СКАН-ИНТЕРФАКС ")
682
 
683
  model_choice = st.radio(
 
22
  from deep_translator import GoogleTranslator as DeepGoogleTranslator
23
  from googletrans import Translator as LegacyTranslator
24
 
25
+
26
class TranslationSystem:
    def __init__(self, batch_size=5):
        """
        Initialize translation system using only deep-translator.

        Args:
            batch_size: int - rows per batch used by callers; kept for
                interface compatibility (translation itself is per-text).
        """
        self.batch_size = batch_size
        # BUGFIX: the module imports the translator as DeepGoogleTranslator
        # ("from deep_translator import GoogleTranslator as DeepGoogleTranslator"),
        # so referencing bare GoogleTranslator raised NameError at runtime.
        # deep-translator accepts full language names as well as codes.
        self.translator = DeepGoogleTranslator(source='russian', target='english')

    def translate_text(self, text):
        """
        Translate a single text using deep-translator, chunking long texts.

        NaN / non-string / blank inputs are returned unchanged; on any
        translation error the original text is returned (best effort).
        """
        if pd.isna(text) or not isinstance(text, str) or not text.strip():
            return text

        text = str(text).strip()
        if not text:
            return text

        try:
            # deep-translator has a 5000-character limit; stay safely under it
            max_chunk_size = 4500

            if len(text) <= max_chunk_size:
                return self.translator.translate(text=text)

            # Split long text into chunks and translate piecewise
            chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
            translated_chunks = []

            for chunk in chunks:
                translated_chunk = self.translator.translate(text=chunk)
                translated_chunks.append(translated_chunk)
                time.sleep(0.5)  # small delay between chunks to ease rate limits

            return ' '.join(translated_chunks)

        except Exception as e:
            st.warning(f"Translation error: {str(e)}. Using original text.")
            return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ def process_file(uploaded_file, model_choice):
68
  df = None
69
  try:
70
  df = pd.read_excel(uploaded_file, sheet_name='Публикации')
71
  llm = init_langchain_llm(model_choice)
72
+ translator = TranslationSystem(batch_size=5)
 
 
 
 
 
 
73
 
74
  # Validate required columns
75
  required_columns = ['Объект', 'Заголовок', 'Выдержки из текста']
 
105
  for i in range(0, len(df), batch_size):
106
  batch_df = df.iloc[i:i+batch_size]
107
 
108
+ for idx, row in batch_df.iterrows():
109
+ try:
110
+ # Translation
111
+ translated_text = translator.translate_text(row['Выдержки из текста'])
112
+ df.at[idx, 'Translated'] = translated_text
113
+
114
+ # Sentiment analysis
115
+ sentiment = analyze_sentiment(translated_text)
116
+ df.at[idx, 'Sentiment'] = sentiment
117
+
118
+ # Event detection
119
+ event_type, event_summary = detect_events(
120
+ llm,
121
+ row['Выдержки из текста'],
122
+ row['Объект']
123
+ )
124
+ df.at[idx, 'Event_Type'] = event_type
125
+ df.at[idx, 'Event_Summary'] = event_summary
126
+
127
+ if sentiment == "Negative":
128
+ impact, reasoning = estimate_impact(
129
  llm,
130
+ translated_text,
131
  row['Объект']
132
  )
133
+ df.at[idx, 'Impact'] = impact
134
+ df.at[idx, 'Reasoning'] = reasoning
135
+
136
+ # Update progress
137
+ progress = (idx + 1) / len(df)
138
+ progress_bar.progress(progress)
139
+ status_text.text(f"Проанализировано {idx + 1} из {len(df)} новостей")
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ except Exception as e:
142
+ if 'rate limit' in str(e).lower():
143
+ wait_time = 240 # 4 minutes wait for rate limit
144
+ st.warning(f"Rate limit reached. Waiting {wait_time} seconds...")
145
+ time.sleep(wait_time)
146
+ continue
147
+ st.warning(f"Ошибка при обработке новости {idx + 1}: {str(e)}")
148
+ continue
149
 
150
+ # Small delay between items to avoid rate limits
151
+ time.sleep(0.5)
152
+
153
+ # Delay between batches
154
+ time.sleep(2)
155
 
156
  return df
157
 
 
555
 
556
  def main():
557
  with st.sidebar:
558
+ st.title("::: AI-анализ мониторинга новостей (v.3.38 ):::")
559
  st.subheader("по материалам СКАН-ИНТЕРФАКС ")
560
 
561
  model_choice = st.radio(