pentarosarium committed
Commit e20a82b · 1 Parent(s): f0111d1
Files changed (1):
  1. app.py  +51 -23
app.py CHANGED
@@ -6,6 +6,24 @@ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
 import plotly.graph_objects as go
 import logging
 import io
+from rapidfuzz import fuzz
+
+def fuzzy_deduplicate(df, column, threshold=55):
+    """Deduplicate rows based on fuzzy matching of text content"""
+    seen_texts = []
+    indices_to_keep = []
+
+    for i, text in enumerate(df[column]):
+        if pd.isna(text):
+            indices_to_keep.append(i)
+            continue
+
+        text = str(text)
+        if not seen_texts or all(fuzz.ratio(text, seen) < threshold for seen in seen_texts):
+            seen_texts.append(text)
+            indices_to_keep.append(i)
+
+    return df.iloc[indices_to_keep]
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
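
A quick usage sketch of the new `fuzzy_deduplicate` helper, for orientation: `rapidfuzz.fuzz.ratio` returns a 0-100 similarity score, and a row is kept only while its score against every previously kept text stays below the threshold, so a lower threshold drops more near-duplicates. The sample rows below are invented for illustration.

```python
# Illustration only: sample texts are made up; the keep/drop rule mirrors the
# fuzzy_deduplicate helper added in this commit (threshold=55).
import pandas as pd
from rapidfuzz import fuzz

df = pd.DataFrame({
    "Выдержки из текста": [
        "Компания X сообщила о падении квартальной прибыли.",
        "Компания X сообщила о падении прибыли за квартал.",  # near-duplicate
        "Банк Y разместил новый выпуск облигаций.",
    ]
})

seen, keep_idx = [], []
for i, text in enumerate(df["Выдержки из текста"].astype(str)):
    # Keep the row only if it is sufficiently dissimilar to everything kept so far.
    if all(fuzz.ratio(text, s) < 55 for s in seen):
        seen.append(text)
        keep_idx.append(i)

print(len(df), "->", len(df.iloc[keep_idx]))  # expected: 3 -> 2 (the paraphrased row is dropped)
```
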
@@ -14,7 +32,6 @@ class EventDetector:
     def __init__(self):
         self.model_name = "google/mt5-small"
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        # Don't initialize models in __init__
         self.model = None
         self.finbert = None
         self.roberta = None
@@ -22,7 +39,6 @@ class EventDetector:
 
     @spaces.GPU
     def initialize_models(self):
-        """Initialize all models with GPU support"""
         try:
             device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Initializing models on device: {device}")
@@ -43,12 +59,14 @@ class EventDetector:
             return "Нет", "Invalid input"
 
         try:
-            # Initialize models if needed
             if self.model is None:
                 if not self.initialize_models():
                     return "Нет", "Model initialization failed"
 
             device = "cuda" if torch.cuda.is_available() else "cpu"
+            # Truncate input text to avoid tensor size mismatch
+            text = text[:500]  # Adjust this value if needed
+
             prompt = f"""<s>Analyze the following news about {entity}:
             Text: {text}
             Task: Identify the main event type and provide a brief summary.</s>"""
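
The generate() call that consumes this prompt sits outside the hunk, so as orientation only, here is a rough sketch of how a character-truncated prompt is typically fed to the mt5 seq2seq model that `initialize_models` loads; the decoding settings (tokenizer `max_length`, `max_new_tokens`) are assumptions, not values taken from app.py.

```python
# Sketch only: decoding settings below are assumptions, not taken from app.py.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small").to(device)

entity = "Компания X"
text = "Длинный текст новости ..."[:500]  # character-level cap, as in the diff above

prompt = f"""<s>Analyze the following news about {entity}:
Text: {text}
Task: Identify the main event type and provide a brief summary.</s>"""

# Tokenizer-level truncation guards against over-long inputs even after the
# character slice above.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```
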
@@ -76,21 +94,30 @@ class EventDetector:
     @spaces.GPU
     def analyze_sentiment(self, text):
         try:
-            # Initialize models if needed
             if self.finbert is None:
                 if not self.initialize_models():
                     return "Neutral"
 
-            results = []
-            texts = [text[:512]]  # Truncate to avoid token length issues
+            # Truncate text to avoid tensor size issues
+            truncated_text = text[:500]
 
-            for model in [self.finbert, self.roberta, self.finbert_tone]:
-                try:
-                    result = model(texts)[0]
-                    results.append(self._get_sentiment(result))
-                except Exception as e:
-                    logger.error(f"Model inference error: {e}")
-                    results.append("Neutral")
+            results = []
+            try:
+                # Process text with all models in a batch
+                inputs = [truncated_text]
+                finbert_result = self.finbert(inputs, truncation=True, max_length=512)[0]
+                roberta_result = self.roberta(inputs, truncation=True, max_length=512)[0]
+                finbert_tone_result = self.finbert_tone(inputs, truncation=True, max_length=512)[0]
+
+                results = [
+                    self._get_sentiment(finbert_result),
+                    self._get_sentiment(roberta_result),
+                    self._get_sentiment(finbert_tone_result)
+                ]
+
+            except Exception as e:
+                logger.error(f"Model inference error: {e}")
+                return "Neutral"
 
             sentiment_counts = pd.Series(results).value_counts()
             return sentiment_counts.index[0] if sentiment_counts.iloc[0] >= 2 else "Neutral"
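
The batched path above still routes every pipeline output through `self._get_sentiment` and then takes a two-of-three vote via `pd.Series.value_counts`. A minimal standalone sketch of that mapping and vote follows; the label sets come from the pre-existing `_get_sentiment` helper, while the example pipeline outputs are invented.

```python
import pandas as pd

def get_sentiment(result: dict) -> str:
    # Map raw pipeline labels to a shared vocabulary (same label sets as the
    # original _get_sentiment helper).
    label = result["label"].lower()
    if label in ("positive", "label_2", "pos"):
        return "Positive"
    if label in ("negative", "label_0", "neg"):
        return "Negative"
    return "Neutral"

# Invented pipeline outputs for illustration: FinBERT, RoBERTa, FinBERT-tone.
raw = [{"label": "positive", "score": 0.91},
       {"label": "LABEL_2", "score": 0.77},
       {"label": "neutral", "score": 0.64}]

votes = [get_sentiment(r) for r in raw]
counts = pd.Series(votes).value_counts()
# At least two models must agree, otherwise fall back to "Neutral".
final = counts.index[0] if counts.iloc[0] >= 2 else "Neutral"
print(votes, "->", final)  # ['Positive', 'Positive', 'Neutral'] -> Positive
```
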
@@ -99,14 +126,6 @@ class EventDetector:
             logger.error(f"Sentiment analysis error: {e}")
             return "Neutral"
 
-    def _get_sentiment(self, result):
-        label = result['label'].lower()
-        if label in ["positive", "label_2", "pos"]:
-            return "Positive"
-        elif label in ["negative", "label_0", "neg"]:
-            return "Negative"
-        return "Neutral"
-
 def create_visualizations(df):
     if df is None or df.empty:
         return None, None
@@ -141,10 +160,19 @@ def process_file(file_obj):
         df = pd.read_excel(file_obj, sheet_name='Публикации')
         logger.info(f"Successfully read Excel file. Shape: {df.shape}")
 
+        # Perform deduplication
+        original_count = len(df)
+        df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
+        logger.info(f"Removed {original_count - len(df)} duplicate entries")
+
         detector = EventDetector()
         processed_rows = []
         total = len(df)
 
+        # Initialize models once for all rows
+        if not detector.initialize_models():
+            raise Exception("Failed to initialize models")
+
         for idx, row in df.iterrows():
             try:
                 text = str(row.get('Выдержки из текста', ''))
@@ -164,7 +192,7 @@ def process_file(file_obj):
                     'Sentiment': sentiment,
                     'Event_Type': event_type,
                     'Event_Summary': event_summary,
-                    'Текст': text
+                    'Текст': text[:1000]  # Truncate text for display
                 })
 
                 if idx % 5 == 0:
@@ -185,7 +213,7 @@ def process_file(file_obj):
 
 def create_interface():
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# AI-анализ мониторинга новостей v.1.10")
+        gr.Markdown("# AI-анализ мониторинга новостей v.1.11")
 
         with gr.Row():
             file_input = gr.File(
 