pentarosarium committed
Commit 4feef77 · 1 Parent(s): e20a82b
Files changed (1)
  1. app.py +107 -76
app.py CHANGED
@@ -28,6 +28,19 @@ def fuzzy_deduplicate(df, column, threshold=55):
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+class ProcessControl:
+    def __init__(self):
+        self.stop_requested = False
+
+    def request_stop(self):
+        self.stop_requested = True
+
+    def should_stop(self):
+        return self.stop_requested
+
+    def reset(self):
+        self.stop_requested = False
+
 class EventDetector:
     def __init__(self):
         self.model_name = "google/mt5-small"
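The hunk header above references fuzzy_deduplicate(df, column, threshold=55), whose body is not part of this diff. For orientation, a minimal sketch of what such a helper typically looks like, assuming the rapidfuzz library and a simple keep-first-occurrence strategy (an illustration, not the commit's actual implementation):

    from rapidfuzz import fuzz
    import pandas as pd

    def fuzzy_deduplicate(df: pd.DataFrame, column: str, threshold: int = 55) -> pd.DataFrame:
        """Keep a row only if its text is less than `threshold`% similar to every row kept so far."""
        kept_texts, kept_idx = [], []
        for idx, text in df[column].fillna("").astype(str).items():
            if all(fuzz.token_sort_ratio(text, seen) < threshold for seen in kept_texts):
                kept_texts.append(text)
                kept_idx.append(idx)
        return df.loc[kept_idx]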
@@ -36,60 +49,16 @@ class EventDetector:
         self.finbert = None
         self.roberta = None
         self.finbert_tone = None
-
-    @spaces.GPU
-    def initialize_models(self):
-        try:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            logger.info(f"Initializing models on device: {device}")
-
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(device)
-            self.finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert", device=device)
-            self.roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment", device=device)
-            self.finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", device=device)
-
-            return True
-        except Exception as e:
-            logger.error(f"Model initialization error: {str(e)}")
-            return False
+        self.control = ProcessControl()

-    @spaces.GPU
-    def detect_events(self, text, entity):
-        if not text or not entity:
-            return "Нет", "Invalid input"
-
-        try:
-            if self.model is None:
-                if not self.initialize_models():
-                    return "Нет", "Model initialization failed"
-
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            # Truncate input text to avoid tensor size mismatch
-            text = text[:500]  # Adjust this value if needed
-
-            prompt = f"""<s>Analyze the following news about {entity}:
-            Text: {text}
-            Task: Identify the main event type and provide a brief summary.</s>"""
-
-            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True,
-                                    truncation=True, max_length=512).to(device)
-
-            outputs = self.model.generate(**inputs, max_length=300, num_return_sequences=1)
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-            event_type = "Нет"
-            if any(term in text.lower() for term in ['отчет', 'выручка', 'прибыль', 'ebitda']):
-                event_type = "Отчетность"
-            elif any(term in text.lower() for term in ['облигаци', 'купон', 'дефолт']):
-                event_type = "РЦБ"
-            elif any(term in text.lower() for term in ['суд', 'иск', 'арбитраж']):
-                event_type = "Суд"
-
-            return event_type, response
-
-        except Exception as e:
-            logger.error(f"Event detection error: {str(e)}")
-            return "Нет", f"Error: {str(e)}"
+    def get_sentiment_label(self, result):
+        """Helper method for sentiment classification"""
+        label = result['label'].lower()
+        if label in ["positive", "label_2", "pos"]:
+            return "Positive"
+        elif label in ["negative", "label_0", "neg"]:
+            return "Negative"
+        return "Neutral"

     @spaces.GPU
     def analyze_sentiment(self, text):
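The new get_sentiment_label helper normalizes the differing label schemes of the three sentiment models into a single Positive/Negative/Neutral vocabulary: word-style labels ("positive", "Negative", …) come from the FinBERT-family pipelines, while the "label_0"/"label_2" cases match the generic LABEL_n outputs of models such as cardiffnlp/twitter-roberta-base-sentiment. A small self-contained check that mirrors the helper's logic (the example dicts are hand-written stand-ins for real pipeline output):

    def get_sentiment_label(result: dict) -> str:
        # Mirrors EventDetector.get_sentiment_label from this commit
        label = result['label'].lower()
        if label in ("positive", "label_2", "pos"):
            return "Positive"
        if label in ("negative", "label_0", "neg"):
            return "Negative"
        return "Neutral"

    assert get_sentiment_label({'label': 'positive', 'score': 0.91}) == "Positive"  # ProsusAI/finbert style
    assert get_sentiment_label({'label': 'LABEL_0', 'score': 0.77}) == "Negative"   # generic LABEL_n style
    assert get_sentiment_label({'label': 'Neutral', 'score': 0.64}) == "Neutral"    # finbert-tone style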
@@ -98,21 +67,19 @@ class EventDetector:
             if not self.initialize_models():
                 return "Neutral"

-        # Truncate text to avoid tensor size issues
         truncated_text = text[:500]
-
         results = []
+
         try:
-            # Process text with all models in a batch
             inputs = [truncated_text]
             finbert_result = self.finbert(inputs, truncation=True, max_length=512)[0]
             roberta_result = self.roberta(inputs, truncation=True, max_length=512)[0]
             finbert_tone_result = self.finbert_tone(inputs, truncation=True, max_length=512)[0]

             results = [
-                self._get_sentiment(finbert_result),
-                self._get_sentiment(roberta_result),
-                self._get_sentiment(finbert_tone_result)
+                self.get_sentiment_label(finbert_result),
+                self.get_sentiment_label(roberta_result),
+                self.get_sentiment_label(finbert_tone_result)
             ]

         except Exception as e:
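The [0] after each pipeline call reflects the transformers pipeline contract: invoked on a list of texts, it returns one {'label': ..., 'score': ...} dict per input, so [0] picks out the result for the single truncated text. A minimal standalone illustration using one of the models named above (the label and score in the comment are indicative):

    from transformers import pipeline

    finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")
    results = finbert(["Quarterly revenue beat expectations."], truncation=True, max_length=512)
    print(results)      # e.g. [{'label': 'positive', 'score': 0.95}]
    print(results[0])   # the dict for the first (and only) input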
@@ -212,8 +179,10 @@ def process_file(file_obj):
         raise

 def create_interface():
+    control = ProcessControl()
+
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# AI-анализ мониторинга новостей v.1.11")
+        gr.Markdown("# AI-анализ мониторинга новостей v.1.12")

         with gr.Row():
             file_input = gr.File(
@@ -223,10 +192,17 @@ def create_interface():
             )

         with gr.Row():
-            analyze_btn = gr.Button(
-                "Начать анализ",
-                variant="primary"
-            )
+            col1, col2 = gr.Columns(2)
+            with col1:
+                analyze_btn = gr.Button(
+                    "Начать анализ",
+                    variant="primary"
+                )
+            with col2:
+                stop_btn = gr.Button(
+                    "❌ Остановить",
+                    variant="stop"
+                )

         with gr.Row():
             progress = gr.Textbox(
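A two-column layout in Gradio is usually written with the gr.Column context manager nested inside a gr.Row. A minimal sketch of the two-button row in that idiom (assuming a recent Gradio release; only the layout is shown):

    import gradio as gr

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        with gr.Row():
            with gr.Column():
                analyze_btn = gr.Button("Начать анализ", variant="primary")
            with gr.Column():
                stop_btn = gr.Button("❌ Остановить", variant="stop")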
@@ -248,35 +224,89 @@
             with gr.Column():
                 events_plot = gr.Plot(label="Распределение событий")

+        def stop_processing():
+            control.request_stop()
+            return "Остановка обработки..."
+
         def analyze(file_bytes):
             if file_bytes is None:
                 gr.Warning("Пожалуйста, загрузите файл")
                 return None, None, None, "Ожидание файла..."

             try:
-                # Create BytesIO object and debug print its content
+                # Reset stop flag
+                control.reset()
+
                 file_obj = io.BytesIO(file_bytes)
                 logger.info("File loaded into BytesIO successfully")

-                # Process file with progress updates
                 progress_status = "Начинаем обработку файла..."
                 yield None, None, None, progress_status

-                df = process_file(file_obj)
+                # Process file
+                df = pd.read_excel(file_obj, sheet_name='Публикации')
+                logger.info(f"Successfully read Excel file. Shape: {df.shape}")

-                if df.empty:
-                    return None, None, None, "Нет данных для обработки"
+                # Deduplication
+                original_count = len(df)
+                df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
+                logger.info(f"Removed {original_count - len(df)} duplicate entries")

-                progress_status = f"Создание визуализаций..."
-                yield None, None, None, progress_status
+                detector = EventDetector()
+                detector.control = control  # Pass control object
+                processed_rows = []
+                total = len(df)
+
+                # Initialize models
+                if not detector.initialize_models():
+                    raise Exception("Failed to initialize models")
+
+                for idx, row in df.iterrows():
+                    if control.should_stop():
+                        yield (
+                            pd.DataFrame(processed_rows) if processed_rows else None,
+                            None, None,
+                            f"Обработка остановлена. Обработано {idx} из {total} строк"
+                        )
+                        return
+
+                    try:
+                        text = str(row.get('Выдержки из текста', ''))
+                        if not text.strip():
+                            continue
+
+                        entity = str(row.get('Объект', ''))
+                        if not entity.strip():
+                            continue
+
+                        event_type, event_summary = detector.detect_events(text, entity)
+                        sentiment = detector.analyze_sentiment(text)
+
+                        processed_rows.append({
+                            'Объект': entity,
+                            'Заголовок': str(row.get('Заголовок', '')),
+                            'Sentiment': sentiment,
+                            'Event_Type': event_type,
+                            'Event_Summary': event_summary,
+                            'Текст': text[:1000]
+                        })
+
+                        if idx % 5 == 0:
+                            progress_status = f"Обработано {idx + 1}/{total} строк"
+                            yield None, None, None, progress_status
+
+                    except Exception as e:
+                        logger.error(f"Error processing row {idx}: {str(e)}")
+                        continue

-                fig_sentiment, fig_events = create_visualizations(df)
+                result_df = pd.DataFrame(processed_rows)
+                fig_sentiment, fig_events = create_visualizations(result_df)

                 return (
-                    df,
-                    fig_sentiment,
-                    fig_events,
-                    f"Обработка завершена успешно! Обработано {len(df)} строк"
+                    result_df,
+                    fig_sentiment,
+                    fig_events,
+                    f"Обработка завершена успешно! Обработано {len(result_df)} строк"
                 )

             except Exception as e:
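analyze is written as a generator: the intermediate yield statements stream status text into the progress textbox while the per-row loop is still running, which is Gradio's supported way of pushing partial updates from an event handler. A stripped-down sketch of that pattern, independent of this app's models and with a single status output:

    import time
    import gradio as gr

    def long_task(n_steps):
        for i in range(int(n_steps)):
            time.sleep(0.1)                       # stand-in for per-row model inference
            yield f"Обработано {i + 1}/{int(n_steps)} строк"
        yield "Обработка завершена успешно!"      # last yielded value stays displayed

    with gr.Blocks() as demo:
        steps = gr.Number(value=20, label="Строк")
        status = gr.Textbox(label="Статус")
        gr.Button("Начать").click(fn=long_task, inputs=[steps], outputs=[status])

    demo.launch()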
@@ -285,6 +315,7 @@ def create_interface():
                 gr.Error(error_msg)
                 return None, None, None, error_msg

+        stop_btn.click(fn=stop_processing, outputs=[progress])
         analyze_btn.click(
             fn=analyze,
             inputs=[file_input],
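The stop button's handler only flips the shared ProcessControl flag and returns a status message; cancellation takes effect the next time the analyze loop calls control.should_stop() between rows, which is why already-processed rows can still be returned. A self-contained sketch of this cooperative-stop wiring, reduced to one status output and a dummy loop (whether the stop click is handled while analyze is still streaming depends on Gradio's queue and concurrency settings):

    import time
    import gradio as gr

    class ProcessControl:
        def __init__(self):
            self.stop_requested = False
        def request_stop(self):
            self.stop_requested = True
        def reset(self):
            self.stop_requested = False
        def should_stop(self):
            return self.stop_requested

    control = ProcessControl()

    def analyze():
        control.reset()
        for i in range(100):
            if control.should_stop():
                yield f"Обработка остановлена. Обработано {i} из 100 строк"
                return
            time.sleep(0.1)   # stand-in for per-row detection and sentiment analysis
            yield f"Обработано {i + 1}/100 строк"
        yield "Обработка завершена успешно!"

    def stop_processing():
        control.request_stop()
        return "Остановка обработки..."

    with gr.Blocks() as demo:
        progress = gr.Textbox(label="Статус")
        with gr.Row():
            analyze_btn = gr.Button("Начать анализ", variant="primary")
            stop_btn = gr.Button("❌ Остановить", variant="stop")
        analyze_btn.click(fn=analyze, outputs=[progress])
        stop_btn.click(fn=stop_processing, outputs=[progress])

    demo.launch()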
 