Commit 4feef77 · v.1.12
Parent(s): e20a82b

app.py CHANGED
@@ -28,6 +28,19 @@ def fuzzy_deduplicate(df, column, threshold=55):
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+class ProcessControl:
+    def __init__(self):
+        self.stop_requested = False
+
+    def request_stop(self):
+        self.stop_requested = True
+
+    def should_stop(self):
+        return self.stop_requested
+
+    def reset(self):
+        self.stop_requested = False
+
 class EventDetector:
     def __init__(self):
         self.model_name = "google/mt5-small"
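The `ProcessControl` class added here is a plain cooperative-cancellation flag: a long-running loop polls `should_stop()` and exits at a safe point rather than being killed mid-row, which is how the reworked `analyze` handler further down in this diff uses it. A minimal standalone sketch of the pattern (the `process_rows` worker and its `rows` argument are illustrative placeholders, not code from app.py):

```python
# Sketch of the cooperative-stop pattern; ProcessControl is as defined in the hunk above.
class ProcessControl:
    def __init__(self):
        self.stop_requested = False

    def request_stop(self):
        self.stop_requested = True

    def should_stop(self):
        return self.stop_requested

    def reset(self):
        self.stop_requested = False


def process_rows(rows, control):
    """Placeholder worker: checks the flag once per row and keeps partial results."""
    control.reset()
    processed = []
    for row in rows:
        if control.should_stop():
            break  # stop gracefully; whatever was processed so far is returned
        processed.append(row)  # stand-in for the real per-row work
    return processed
```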
@@ -36,60 +49,16 @@ class EventDetector:
         self.finbert = None
         self.roberta = None
         self.finbert_tone = None
-
-    @spaces.GPU
-    def initialize_models(self):
-        try:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            logger.info(f"Initializing models on device: {device}")
-
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(device)
-            self.finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert", device=device)
-            self.roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment", device=device)
-            self.finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", device=device)
-
-            return True
-        except Exception as e:
-            logger.error(f"Model initialization error: {str(e)}")
-            return False
+        self.control = ProcessControl()
 
-
-
-
-
-
-
-
-
-            return "Нет", "Model initialization failed"
-
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            # Truncate input text to avoid tensor size mismatch
-            text = text[:500]  # Adjust this value if needed
-
-            prompt = f"""<s>Analyze the following news about {entity}:
-            Text: {text}
-            Task: Identify the main event type and provide a brief summary.</s>"""
-
-            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True,
-                                    truncation=True, max_length=512).to(device)
-
-            outputs = self.model.generate(**inputs, max_length=300, num_return_sequences=1)
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-            event_type = "Нет"
-            if any(term in text.lower() for term in ['отчет', 'выручка', 'прибыль', 'ebitda']):
-                event_type = "Отчетность"
-            elif any(term in text.lower() for term in ['облигаци', 'купон', 'дефолт']):
-                event_type = "РЦБ"
-            elif any(term in text.lower() for term in ['суд', 'иск', 'арбитраж']):
-                event_type = "Суд"
-
-            return event_type, response
-
-        except Exception as e:
-            logger.error(f"Event detection error: {str(e)}")
-            return "Нет", f"Error: {str(e)}"
+    def get_sentiment_label(self, result):
+        """Helper method for sentiment classification"""
+        label = result['label'].lower()
+        if label in ["positive", "label_2", "pos"]:
+            return "Positive"
+        elif label in ["negative", "label_0", "neg"]:
+            return "Negative"
+        return "Neutral"
 
     @spaces.GPU
     def analyze_sentiment(self, text):
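The new `get_sentiment_label` helper exists because the three checkpoints do not share a label vocabulary: ProsusAI/finbert and yiyanghkust/finbert-tone typically return word labels such as `positive`/`Negative`/`neutral`, while cardiffnlp/twitter-roberta-base-sentiment returns `LABEL_0`/`LABEL_1`/`LABEL_2` (negative/neutral/positive). A standalone check of the same normalization; the sample dicts below are made up, but they follow the `{'label': ..., 'score': ...}` shape that Hugging Face `sentiment-analysis` pipelines return:

```python
# Standalone version of the normalization performed by get_sentiment_label above.
def get_sentiment_label(result: dict) -> str:
    label = result["label"].lower()
    if label in ("positive", "label_2", "pos"):
        return "Positive"
    if label in ("negative", "label_0", "neg"):
        return "Negative"
    return "Neutral"

print(get_sentiment_label({"label": "LABEL_2", "score": 0.91}))   # Positive
print(get_sentiment_label({"label": "negative", "score": 0.84}))  # Negative
print(get_sentiment_label({"label": "Neutral", "score": 0.67}))   # Neutral
```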
@@ -98,21 +67,19 @@
         if not self.initialize_models():
             return "Neutral"
 
-        # Truncate text to avoid tensor size issues
         truncated_text = text[:500]
-
         results = []
+
         try:
-            # Process text with all models in a batch
            inputs = [truncated_text]
            finbert_result = self.finbert(inputs, truncation=True, max_length=512)[0]
            roberta_result = self.roberta(inputs, truncation=True, max_length=512)[0]
            finbert_tone_result = self.finbert_tone(inputs, truncation=True, max_length=512)[0]
 
            results = [
-                self.
-                self.
-                self.
+                self.get_sentiment_label(finbert_result),
+                self.get_sentiment_label(roberta_result),
+                self.get_sentiment_label(finbert_tone_result)
            ]
 
        except Exception as e:
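The code that consumes `results` lies outside this hunk, so the actual aggregation in app.py is not shown here. Purely as an illustration, a common way to reduce three per-model labels to a single call is a majority vote with a neutral fallback (hypothetical helper, not taken from app.py):

```python
from collections import Counter

def vote(labels):
    """Hypothetical aggregation: majority vote over normalized labels."""
    if not labels:
        return "Neutral"
    label, count = Counter(labels).most_common(1)[0]
    return label if count >= 2 else "Neutral"

print(vote(["Positive", "Neutral", "Positive"]))   # Positive
print(vote(["Positive", "Negative", "Neutral"]))   # Neutral (no majority)
```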
@@ -212,8 +179,10 @@ def process_file(file_obj):
         raise
 
 def create_interface():
+    control = ProcessControl()
+
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# AI-анализ мониторинга новостей v.1.
+        gr.Markdown("# AI-анализ мониторинга новостей v.1.12")
 
         with gr.Row():
             file_input = gr.File(
@@ -223,10 +192,17 @@
             )
 
         with gr.Row():
-
-
-
-
+            col1, col2 = gr.Columns(2)
+            with col1:
+                analyze_btn = gr.Button(
+                    "Начать анализ",
+                    variant="primary"
+                )
+            with col2:
+                stop_btn = gr.Button(
+                    "❌ Остановить",
+                    variant="stop"
+                )
 
         with gr.Row():
             progress = gr.Textbox(
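One note on the layout above: as far as the documented Gradio API goes, columns are created with nested `with gr.Column():` context managers inside a `gr.Row()`; a `gr.Columns(2)` call is not part of that API, so this line may need revisiting. A minimal sketch of the conventional pattern for the same two-button row (assuming a recent Gradio release):

```python
import gradio as gr

# Conventional Gradio layout for the same pair of buttons (sketch, not the committed code).
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            analyze_btn = gr.Button("Начать анализ", variant="primary")
        with gr.Column():
            stop_btn = gr.Button("❌ Остановить", variant="stop")
```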
@@ -248,35 +224,89 @@
             with gr.Column():
                 events_plot = gr.Plot(label="Распределение событий")
 
+        def stop_processing():
+            control.request_stop()
+            return "Остановка обработки..."
+
         def analyze(file_bytes):
             if file_bytes is None:
                 gr.Warning("Пожалуйста, загрузите файл")
                 return None, None, None, "Ожидание файла..."
 
             try:
-                #
+                # Reset stop flag
+                control.reset()
+
                 file_obj = io.BytesIO(file_bytes)
                 logger.info("File loaded into BytesIO successfully")
 
-                # Process file with progress updates
                 progress_status = "Начинаем обработку файла..."
                 yield None, None, None, progress_status
 
-
+                # Process file
+                df = pd.read_excel(file_obj, sheet_name='Публикации')
+                logger.info(f"Successfully read Excel file. Shape: {df.shape}")
 
-
-
+                # Deduplication
+                original_count = len(df)
+                df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
+                logger.info(f"Removed {original_count - len(df)} duplicate entries")
 
-
-
+                detector = EventDetector()
+                detector.control = control  # Pass control object
+                processed_rows = []
+                total = len(df)
+
+                # Initialize models
+                if not detector.initialize_models():
+                    raise Exception("Failed to initialize models")
+
+                for idx, row in df.iterrows():
+                    if control.should_stop():
+                        yield (
+                            pd.DataFrame(processed_rows) if processed_rows else None,
+                            None, None,
+                            f"Обработка остановлена. Обработано {idx} из {total} строк"
+                        )
+                        return
+
+                    try:
+                        text = str(row.get('Выдержки из текста', ''))
+                        if not text.strip():
+                            continue
+
+                        entity = str(row.get('Объект', ''))
+                        if not entity.strip():
+                            continue
+
+                        event_type, event_summary = detector.detect_events(text, entity)
+                        sentiment = detector.analyze_sentiment(text)
+
+                        processed_rows.append({
+                            'Объект': entity,
+                            'Заголовок': str(row.get('Заголовок', '')),
+                            'Sentiment': sentiment,
+                            'Event_Type': event_type,
+                            'Event_Summary': event_summary,
+                            'Текст': text[:1000]
+                        })
+
+                        if idx % 5 == 0:
+                            progress_status = f"Обработано {idx + 1}/{total} строк"
+                            yield None, None, None, progress_status
+
+                    except Exception as e:
+                        logger.error(f"Error processing row {idx}: {str(e)}")
+                        continue
 
-
+                result_df = pd.DataFrame(processed_rows)
+                fig_sentiment, fig_events = create_visualizations(result_df)
 
                 return (
-
-                    fig_sentiment,
-                    fig_events,
-                    f"Обработка завершена успешно! Обработано {len(
+                    result_df,
+                    fig_sentiment,
+                    fig_events,
+                    f"Обработка завершена успешно! Обработано {len(result_df)} строк"
                 )
 
             except Exception as e:
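One behavioral detail in the `analyze` handler above: because the function body contains `yield`, Python treats it as a generator and Gradio streams the yielded tuples as interim updates. A bare `return (...)` inside a generator only sets `StopIteration.value` and is never delivered to the consumer, so the final results and the early-exit tuples would normally be yielded as well. A toy illustration of the difference (not app.py code):

```python
# In a generator, a returned value is swallowed by StopIteration; only yields reach the caller.
def handler_with_return():
    yield "working..."
    return "done"  # a for-loop or list() never sees this value

def handler_with_yield():
    yield "working..."
    yield "done"   # delivered like any other update

print(list(handler_with_return()))  # ['working...']
print(list(handler_with_yield()))   # ['working...', 'done']
```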
@@ -285,6 +315,7 @@ def create_interface():
             gr.Error(error_msg)
             return None, None, None, error_msg
 
+        stop_btn.click(fn=stop_processing, outputs=[progress])
         analyze_btn.click(
             fn=analyze,
             inputs=[file_input],
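On the wiring added above: the manual flag lets the loop notice a stop request at a row boundary and still emit the partially processed rows. Gradio also offers built-in event cancellation through the `cancels=` argument of `.click()`, which hard-stops the running job instead of letting it finish the current row. A sketch of that alternative, with placeholder output components (the real output list continues past the end of this hunk):

```python
# Hypothetical alternative wiring using Gradio's built-in cancellation.
analyze_event = analyze_btn.click(
    fn=analyze,
    inputs=[file_input],
    outputs=[results_table, sentiment_plot, events_plot, progress],  # placeholder names
)
stop_btn.click(fn=None, cancels=[analyze_event])
```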