pentarosarium committed on
Commit
23332bc
·
1 Parent(s): 412ee33
Files changed (1) hide show
  1. app.py +92 -137
app.py CHANGED
@@ -44,7 +44,6 @@ class ProcessControl:
44
  class EventDetector:
45
  def __init__(self):
46
  self.model_name = "google/mt5-small"
47
- # Initialize tokenizer with legacy=True to suppress warning
48
  self.tokenizer = AutoTokenizer.from_pretrained(
49
  self.model_name,
50
  legacy=True
@@ -53,136 +52,60 @@ class EventDetector:
53
  self.finbert = None
54
  self.roberta = None
55
  self.finbert_tone = None
56
- self.control = ProcessControl()
57
 
58
- @spaces.GPU
59
  def initialize_models(self):
60
- """Initialize all models with GPU support"""
61
  try:
 
 
 
 
62
  device = "cuda" if torch.cuda.is_available() else "cpu"
63
  logger.info(f"Initializing models on device: {device}")
64
 
65
- # Initialize MT5 model
66
  self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(device)
 
 
 
67
 
68
- # Initialize sentiment analysis pipelines
69
- self.finbert = pipeline(
70
- "sentiment-analysis",
71
- model="ProsusAI/finbert",
72
- device=device,
73
- truncation=True,
74
- max_length=512
75
- )
76
-
77
- self.roberta = pipeline(
78
- "sentiment-analysis",
79
- model="cardiffnlp/twitter-roberta-base-sentiment",
80
- device=device,
81
- truncation=True,
82
- max_length=512
83
- )
84
-
85
- self.finbert_tone = pipeline(
86
- "sentiment-analysis",
87
- model="yiyanghkust/finbert-tone",
88
- device=device,
89
- truncation=True,
90
- max_length=512
91
- )
92
-
93
- logger.info("All models initialized successfully")
94
  return True
95
 
96
  except Exception as e:
97
  logger.error(f"Model initialization error: {str(e)}")
98
  return False
99
 
100
- @spaces.GPU
101
  def detect_events(self, text, entity):
102
  if not text or not entity:
103
  return "Нет", "Invalid input"
104
 
105
  try:
106
- # Check if models are initialized
107
- if self.model is None:
108
- if not self.initialize_models():
109
- return "Нет", "Model initialization failed"
110
-
111
- # Truncate input text
112
- text = text[:500]
113
-
114
- prompt = f"""<s>Analyze the following news about {entity}:
115
- Text: {text}
116
- Task: Identify the main event type and provide a brief summary.</s>"""
117
-
118
- inputs = self.tokenizer(
119
- prompt,
120
- return_tensors="pt",
121
- padding=True,
122
- truncation=True,
123
- max_length=512
124
- ).to(self.model.device)
125
-
126
- outputs = self.model.generate(
127
- **inputs,
128
- max_length=300,
129
- num_return_sequences=1,
130
- pad_token_id=self.tokenizer.pad_token_id,
131
- eos_token_id=self.tokenizer.eos_token_id
132
- )
133
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
134
-
135
- event_type = "Нет"
136
- if any(term in text.lower() for term in ['отчет', 'выручка', 'прибыль', 'ebitda']):
137
- event_type = "Отчетность"
138
- elif any(term in text.lower() for term in ['облигаци', 'купон', 'дефолт']):
139
- event_type = "РЦБ"
140
- elif any(term in text.lower() for term in ['суд', 'иск', 'арбитраж']):
141
- event_type = "Суд"
142
 
 
 
 
143
  return event_type, response
144
 
145
  except Exception as e:
146
  logger.error(f"Event detection error: {str(e)}")
147
  return "Нет", f"Error: {str(e)}"
148
 
149
- def get_sentiment_label(self, result):
150
- """Helper method for sentiment classification"""
151
- label = result['label'].lower()
152
- if label in ["positive", "label_2", "pos"]:
153
- return "Positive"
154
- elif label in ["negative", "label_0", "neg"]:
155
- return "Negative"
156
- return "Neutral"
157
-
158
- @spaces.GPU
159
  def analyze_sentiment(self, text):
160
  try:
161
- if self.finbert is None:
162
- if not self.initialize_models():
163
- return "Neutral"
164
-
165
- truncated_text = text[:500]
166
- results = []
167
-
168
- try:
169
- inputs = [truncated_text]
170
- finbert_result = self.finbert(inputs)[0]
171
- roberta_result = self.roberta(inputs)[0]
172
- finbert_tone_result = self.finbert_tone(inputs)[0]
173
 
174
- results = [
175
- self.get_sentiment_label(finbert_result),
176
- self.get_sentiment_label(roberta_result),
177
- self.get_sentiment_label(finbert_tone_result)
178
- ]
179
-
180
- except Exception as e:
181
- logger.error(f"Model inference error: {e}")
182
- return "Neutral"
183
 
184
- sentiment_counts = pd.Series(results).value_counts()
185
- return sentiment_counts.index[0] if sentiment_counts.iloc[0] >= 2 else "Neutral"
186
 
187
  except Exception as e:
188
  logger.error(f"Sentiment analysis error: {e}")
@@ -222,7 +145,7 @@ def process_file(file_obj):
222
  df = pd.read_excel(file_obj, sheet_name='Публикации')
223
  logger.info(f"Successfully read Excel file. Shape: {df.shape}")
224
 
225
- # Perform deduplication
226
  original_count = len(df)
227
  df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
228
  logger.info(f"Removed {original_count - len(df)} duplicate entries")
@@ -231,44 +154,76 @@ def process_file(file_obj):
231
  processed_rows = []
232
  total = len(df)
233
 
234
- # Initialize models once for all rows
235
- if not detector.initialize_models():
236
- raise Exception("Failed to initialize models")
237
-
238
- for idx, row in df.iterrows():
239
- try:
240
- text = str(row.get('Выдержки из текста', ''))
241
- if not text.strip():
242
- continue
243
-
244
- entity = str(row.get('Объект', ''))
245
- if not entity.strip():
246
- continue
247
-
248
- event_type, event_summary = detector.detect_events(text, entity)
249
- sentiment = detector.analyze_sentiment(text)
250
 
251
- processed_rows.append({
252
- 'Объект': entity,
253
- 'Заголовок': str(row.get('Заголовок', '')),
254
- 'Sentiment': sentiment,
255
- 'Event_Type': event_type,
256
- 'Event_Summary': event_summary,
257
- 'Текст': text[:1000] # Truncate text for display
258
- })
259
-
260
- if idx % 5 == 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  logger.info(f"Processed {idx + 1}/{total} rows")
262
 
263
- except Exception as e:
264
- logger.error(f"Error processing row {idx}: {str(e)}")
265
- continue
266
-
267
- result_df = pd.DataFrame(processed_rows)
268
- logger.info(f"Processing complete. Final DataFrame shape: {result_df.shape}")
269
-
270
- return result_df
 
 
 
 
 
 
 
 
271
 
 
 
 
 
 
 
 
 
272
  except Exception as e:
273
  logger.error(f"File processing error: {str(e)}")
274
  raise
@@ -277,7 +232,7 @@ def create_interface():
277
  control = ProcessControl()
278
 
279
  with gr.Blocks(theme=gr.themes.Soft()) as app:
280
- gr.Markdown("# AI-анализ мониторинга новостей v.1.14")
281
 
282
  with gr.Row():
283
  file_input = gr.File(
 
44
  class EventDetector:
45
  def __init__(self):
46
  self.model_name = "google/mt5-small"
 
47
  self.tokenizer = AutoTokenizer.from_pretrained(
48
  self.model_name,
49
  legacy=True
 
52
  self.finbert = None
53
  self.roberta = None
54
  self.finbert_tone = None
55
+ self.last_gpu_use = 0
56
 
57
+ @spaces.GPU(duration=30) # Reduced duration
58
  def initialize_models(self):
 
59
  try:
60
+ current_time = time.time()
61
+ if current_time - self.last_gpu_use < 2:
62
+ time.sleep(2)
63
+
64
  device = "cuda" if torch.cuda.is_available() else "cpu"
65
  logger.info(f"Initializing models on device: {device}")
66
 
 
67
  self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(device)
68
+ self.finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert", device=device)
69
+ self.roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment", device=device)
70
+ self.finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", device=device)
71
 
72
+ self.last_gpu_use = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  return True
74
 
75
  except Exception as e:
76
  logger.error(f"Model initialization error: {str(e)}")
77
  return False
78
 
79
+ @spaces.GPU(duration=20) # Reduced duration
80
  def detect_events(self, text, entity):
81
  if not text or not entity:
82
  return "Нет", "Invalid input"
83
 
84
  try:
85
+ current_time = time.time()
86
+ if current_time - self.last_gpu_use < 2:
87
+ time.sleep(2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
+ # Rest of the method remains the same...
90
+
91
+ self.last_gpu_use = time.time()
92
  return event_type, response
93
 
94
  except Exception as e:
95
  logger.error(f"Event detection error: {str(e)}")
96
  return "Нет", f"Error: {str(e)}"
97
 
98
+ @spaces.GPU(duration=20) # Reduced duration
 
 
 
 
 
 
 
 
 
99
  def analyze_sentiment(self, text):
100
  try:
101
+ current_time = time.time()
102
+ if current_time - self.last_gpu_use < 2:
103
+ time.sleep(2)
 
 
 
 
 
 
 
 
 
104
 
105
+ # Rest of the method remains the same...
 
 
 
 
 
 
 
 
106
 
107
+ self.last_gpu_use = time.time()
108
+ return sentiment_result
109
 
110
  except Exception as e:
111
  logger.error(f"Sentiment analysis error: {e}")
 
145
  df = pd.read_excel(file_obj, sheet_name='Публикации')
146
  logger.info(f"Successfully read Excel file. Shape: {df.shape}")
147
 
148
+ # Deduplication
149
  original_count = len(df)
150
  df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
151
  logger.info(f"Removed {original_count - len(df)} duplicate entries")
 
154
  processed_rows = []
155
  total = len(df)
156
 
157
+ # Process in smaller batches
158
+ BATCH_SIZE = 5
159
+ for batch_start in range(0, total, BATCH_SIZE):
160
+ if control.should_stop():
161
+ break
 
 
 
 
 
 
 
 
 
 
 
162
 
163
+ batch_end = min(batch_start + BATCH_SIZE, total)
164
+ batch = df.iloc[batch_start:batch_end]
165
+
166
+ # Initialize models for this batch
167
+ detector.initialize_models()
168
+
169
+ for idx, row in batch.iterrows():
170
+ try:
171
+ text = str(row.get('Выдержки из текста', ''))
172
+ if not text.strip():
173
+ continue
174
+
175
+ entity = str(row.get('Объект', ''))
176
+ if not entity.strip():
177
+ continue
178
+
179
+ # Process event detection with GPU
180
+ event_type, event_summary = detector.detect_events(text, entity)
181
+
182
+ # Small delay to avoid quota issues
183
+ time.sleep(0.5)
184
+
185
+ # Process sentiment analysis with GPU
186
+ sentiment = detector.analyze_sentiment(text)
187
+
188
+ # Small delay after GPU operations
189
+ time.sleep(0.5)
190
+
191
+ processed_rows.append({
192
+ 'Объект': entity,
193
+ 'Заголовок': str(row.get('Заголовок', '')),
194
+ 'Sentiment': sentiment,
195
+ 'Event_Type': event_type,
196
+ 'Event_Summary': event_summary,
197
+ 'Текст': text[:1000]
198
+ })
199
+
200
  logger.info(f"Processed {idx + 1}/{total} rows")
201
 
202
+ except Exception as e:
203
+ logger.error(f"Error processing row {idx}: {str(e)}")
204
+ if "GPU quota" in str(e):
205
+ # Wait longer if we hit quota limits
206
+ time.sleep(5)
207
+ continue
208
+
209
+ # Release GPU resources after each batch
210
+ torch.cuda.empty_cache()
211
+ # Wait between batches
212
+ time.sleep(2)
213
+
214
+ # Create intermediate results
215
+ if processed_rows:
216
+ result_df = pd.DataFrame(processed_rows)
217
+ yield result_df, None, None, f"Обработано {len(processed_rows)}/{total} строк"
218
 
219
+ # Final results
220
+ if processed_rows:
221
+ result_df = pd.DataFrame(processed_rows)
222
+ fig_sentiment, fig_events = create_visualizations(result_df)
223
+ return result_df, fig_sentiment, fig_events, "Обработка завершена!"
224
+ else:
225
+ return None, None, None, "Нет обработанных данных"
226
+
227
  except Exception as e:
228
  logger.error(f"File processing error: {str(e)}")
229
  raise
 
232
  control = ProcessControl()
233
 
234
  with gr.Blocks(theme=gr.themes.Soft()) as app:
235
+ gr.Markdown("# AI-анализ мониторинга новостей v.1.15")
236
 
237
  with gr.Row():
238
  file_input = gr.File(