Spaces:

pentarosarium
/

gprocess

Sleeping

App Files Community

pentarosarium committed on Nov 20, 2024

Commit

23332bc

1 Parent(s): 412ee33

v.1.15

Browse files

Files changed (1) hide show

app.py +92 -137

app.py CHANGED Viewed

@@ -44,7 +44,6 @@ class ProcessControl:
 class EventDetector:
     def __init__(self):
         self.model_name = "google/mt5-small"
-        # Initialize tokenizer with legacy=True to suppress warning
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_name,
             legacy=True
@@ -53,136 +52,60 @@ class EventDetector:
         self.finbert = None
         self.roberta = None
         self.finbert_tone = None
-        self.control = ProcessControl()
-    @spaces.GPU
     def initialize_models(self):
-        """Initialize all models with GPU support"""
         try:
             device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Initializing models on device: {device}")
-            # Initialize MT5 model
             self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(device)
-            # Initialize sentiment analysis pipelines
-            self.finbert = pipeline(
-                "sentiment-analysis",
-                model="ProsusAI/finbert",
-                device=device,
-                truncation=True,
-                max_length=512
-            )
-            self.roberta = pipeline(
-                "sentiment-analysis",
-                model="cardiffnlp/twitter-roberta-base-sentiment",
-                device=device,
-                truncation=True,
-                max_length=512
-            )
-            self.finbert_tone = pipeline(
-                "sentiment-analysis",
-                model="yiyanghkust/finbert-tone",
-                device=device,
-                truncation=True,
-                max_length=512
-            )
-            logger.info("All models initialized successfully")
             return True
         except Exception as e:
             logger.error(f"Model initialization error: {str(e)}")
             return False
-    @spaces.GPU
     def detect_events(self, text, entity):
         if not text or not entity:
             return "Нет", "Invalid input"
         try:
-            # Check if models are initialized
-            if self.model is None:
-                if not self.initialize_models():
-                    return "Нет", "Model initialization failed"
-            # Truncate input text
-            text = text[:500]
-            prompt = f"""<s>Analyze the following news about {entity}:
-            Text: {text}
-            Task: Identify the main event type and provide a brief summary.</s>"""
-            inputs = self.tokenizer(
-                prompt,
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-                max_length=512
-            ).to(self.model.device)
-            outputs = self.model.generate(
-                **inputs,
-                max_length=300,
-                num_return_sequences=1,
-                pad_token_id=self.tokenizer.pad_token_id,
-                eos_token_id=self.tokenizer.eos_token_id
-            )
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            event_type = "Нет"
-            if any(term in text.lower() for term in ['отчет', 'выручка', 'прибыль', 'ebitda']):
-                event_type = "Отчетность"
-            elif any(term in text.lower() for term in ['облигаци', 'купон', 'дефолт']):
-                event_type = "РЦБ"
-            elif any(term in text.lower() for term in ['суд', 'иск', 'арбитраж']):
-                event_type = "Суд"
             return event_type, response
         except Exception as e:
             logger.error(f"Event detection error: {str(e)}")
             return "Нет", f"Error: {str(e)}"
-    def get_sentiment_label(self, result):
-        """Helper method for sentiment classification"""
-        label = result['label'].lower()
-        if label in ["positive", "label_2", "pos"]:
-            return "Positive"
-        elif label in ["negative", "label_0", "neg"]:
-            return "Negative"
-        return "Neutral"
-    @spaces.GPU
     def analyze_sentiment(self, text):
         try:
-            if self.finbert is None:
-                if not self.initialize_models():
-                    return "Neutral"
-            truncated_text = text[:500]
-            results = []
-            try:
-                inputs = [truncated_text]
-                finbert_result = self.finbert(inputs)[0]
-                roberta_result = self.roberta(inputs)[0]
-                finbert_tone_result = self.finbert_tone(inputs)[0]
-                results = [
-                    self.get_sentiment_label(finbert_result),
-                    self.get_sentiment_label(roberta_result),
-                    self.get_sentiment_label(finbert_tone_result)
-                ]
-            except Exception as e:
-                logger.error(f"Model inference error: {e}")
-                return "Neutral"
-            sentiment_counts = pd.Series(results).value_counts()
-            return sentiment_counts.index[0] if sentiment_counts.iloc[0] >= 2 else "Neutral"
         except Exception as e:
             logger.error(f"Sentiment analysis error: {e}")
@@ -222,7 +145,7 @@ def process_file(file_obj):
         df = pd.read_excel(file_obj, sheet_name='Публикации')
         logger.info(f"Successfully read Excel file. Shape: {df.shape}")
-        # Perform deduplication
         original_count = len(df)
         df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
         logger.info(f"Removed {original_count - len(df)} duplicate entries")
@@ -231,44 +154,76 @@ def process_file(file_obj):
         processed_rows = []
         total = len(df)
-        # Initialize models once for all rows
-        if not detector.initialize_models():
-            raise Exception("Failed to initialize models")
-        for idx, row in df.iterrows():
-            try:
-                text = str(row.get('Выдержки из текста', ''))
-                if not text.strip():
-                    continue
-                entity = str(row.get('Объект', ''))
-                if not entity.strip():
-                    continue
-                event_type, event_summary = detector.detect_events(text, entity)
-                sentiment = detector.analyze_sentiment(text)
-                processed_rows.append({
-                    'Объект': entity,
-                    'Заголовок': str(row.get('Заголовок', '')),
-                    'Sentiment': sentiment,
-                    'Event_Type': event_type,
-                    'Event_Summary': event_summary,
-                    'Текст': text[:1000]  # Truncate text for display
-                })
-                if idx % 5 == 0:
                     logger.info(f"Processed {idx + 1}/{total} rows")
-            except Exception as e:
-                logger.error(f"Error processing row {idx}: {str(e)}")
-                continue
-        result_df = pd.DataFrame(processed_rows)
-        logger.info(f"Processing complete. Final DataFrame shape: {result_df.shape}")
-        return result_df
     except Exception as e:
         logger.error(f"File processing error: {str(e)}")
         raise
@@ -277,7 +232,7 @@ def create_interface():
     control = ProcessControl()
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# AI-анализ мониторинга новостей v.1.14")
         with gr.Row():
             file_input = gr.File(

 class EventDetector:
     def __init__(self):
         self.model_name = "google/mt5-small"
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_name,
             legacy=True
         self.finbert = None
         self.roberta = None
         self.finbert_tone = None
+        self.last_gpu_use = 0
+    @spaces.GPU(duration=30)  # Reduced duration
     def initialize_models(self):
         try:
+            current_time = time.time()
+            if current_time - self.last_gpu_use < 2:
+                time.sleep(2)
             device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Initializing models on device: {device}")
             self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(device)
+            self.finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert", device=device)
+            self.roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment", device=device)
+            self.finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", device=device)
+            self.last_gpu_use = time.time()
             return True
         except Exception as e:
             logger.error(f"Model initialization error: {str(e)}")
             return False
+    @spaces.GPU(duration=20)  # Reduced duration
     def detect_events(self, text, entity):
         if not text or not entity:
             return "Нет", "Invalid input"
         try:
+            current_time = time.time()
+            if current_time - self.last_gpu_use < 2:
+                time.sleep(2)
+            # Rest of the method remains the same...
+            self.last_gpu_use = time.time()
             return event_type, response
         except Exception as e:
             logger.error(f"Event detection error: {str(e)}")
             return "Нет", f"Error: {str(e)}"
+    @spaces.GPU(duration=20)  # Reduced duration
     def analyze_sentiment(self, text):
         try:
+            current_time = time.time()
+            if current_time - self.last_gpu_use < 2:
+                time.sleep(2)
+            # Rest of the method remains the same...
+            self.last_gpu_use = time.time()
+            return sentiment_result
         except Exception as e:
             logger.error(f"Sentiment analysis error: {e}")
         df = pd.read_excel(file_obj, sheet_name='Публикации')
         logger.info(f"Successfully read Excel file. Shape: {df.shape}")
+        # Deduplication
         original_count = len(df)
         df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
         logger.info(f"Removed {original_count - len(df)} duplicate entries")
         processed_rows = []
         total = len(df)
+        # Process in smaller batches
+        BATCH_SIZE = 5
+        for batch_start in range(0, total, BATCH_SIZE):
+            if control.should_stop():
+                break
+            batch_end = min(batch_start + BATCH_SIZE, total)
+            batch = df.iloc[batch_start:batch_end]
+            # Initialize models for this batch
+            detector.initialize_models()
+            for idx, row in batch.iterrows():
+                try:
+                    text = str(row.get('Выдержки из текста', ''))
+                    if not text.strip():
+                        continue
+                    entity = str(row.get('Объект', ''))
+                    if not entity.strip():
+                        continue
+                    # Process event detection with GPU
+                    event_type, event_summary = detector.detect_events(text, entity)
+                    # Small delay to avoid quota issues
+                    time.sleep(0.5)
+                    # Process sentiment analysis with GPU
+                    sentiment = detector.analyze_sentiment(text)
+                    # Small delay after GPU operations
+                    time.sleep(0.5)
+                    processed_rows.append({
+                        'Объект': entity,
+                        'Заголовок': str(row.get('Заголовок', '')),
+                        'Sentiment': sentiment,
+                        'Event_Type': event_type,
+                        'Event_Summary': event_summary,
+                        'Текст': text[:1000]
+                    })
                     logger.info(f"Processed {idx + 1}/{total} rows")
+                except Exception as e:
+                    logger.error(f"Error processing row {idx}: {str(e)}")
+                    if "GPU quota" in str(e):
+                        # Wait longer if we hit quota limits
+                        time.sleep(5)
+                    continue
+            # Release GPU resources after each batch
+            torch.cuda.empty_cache()
+            # Wait between batches
+            time.sleep(2)
+            # Create intermediate results
+            if processed_rows:
+                result_df = pd.DataFrame(processed_rows)
+                yield result_df, None, None, f"Обработано {len(processed_rows)}/{total} строк"
+        # Final results
+        if processed_rows:
+            result_df = pd.DataFrame(processed_rows)
+            fig_sentiment, fig_events = create_visualizations(result_df)
+            return result_df, fig_sentiment, fig_events, "Обработка завершена!"
+        else:
+            return None, None, None, "Нет обработанных данных"
     except Exception as e:
         logger.error(f"File processing error: {str(e)}")
         raise
     control = ProcessControl()
     with gr.Blocks(theme=gr.themes.Soft()) as app:
+        gr.Markdown("# AI-анализ мониторинга новостей v.1.15")
         with gr.Row():
             file_input = gr.File(