pentarosarium committed
Commit e20a82b · 1 Parent(s): f0111d1
Files changed (1):
  1. app.py  +51 -23
app.py CHANGED
@@ -6,6 +6,24 @@ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
 import plotly.graph_objects as go
 import logging
 import io
+from rapidfuzz import fuzz
+
+def fuzzy_deduplicate(df, column, threshold=55):
+    """Deduplicate rows based on fuzzy matching of text content"""
+    seen_texts = []
+    indices_to_keep = []
+
+    for i, text in enumerate(df[column]):
+        if pd.isna(text):
+            indices_to_keep.append(i)
+            continue
+
+        text = str(text)
+        if not seen_texts or all(fuzz.ratio(text, seen) < threshold for seen in seen_texts):
+            seen_texts.append(text)
+            indices_to_keep.append(i)
+
+    return df.iloc[indices_to_keep]
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
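
A quick usage sketch of the new `fuzzy_deduplicate` helper, for orientation: `rapidfuzz.fuzz.ratio` returns a 0-100 similarity score, and a row is kept only while its score against every previously kept text stays below the threshold, so a lower threshold drops more near-duplicates. The sample rows below are invented for illustration.

```python
# Illustration only: sample texts are made up; the keep/drop rule mirrors the
# fuzzy_deduplicate helper added in this commit (threshold=55).
import pandas as pd
from rapidfuzz import fuzz

df = pd.DataFrame({
    "Выдержки из текста": [
        "Компания X сообщила о падении квартальной прибыли.",
        "Компания X сообщила о падении прибыли за квартал.",  # near-duplicate
        "Банк Y разместил новый выпуск облигаций.",
    ]
})

seen, keep_idx = [], []
for i, text in enumerate(df["Выдержки из текста"].astype(str)):
    # Keep the row only if it is sufficiently dissimilar to everything kept so far.
    if all(fuzz.ratio(text, s) < 55 for s in seen):
        seen.append(text)
        keep_idx.append(i)

print(len(df), "->", len(df.iloc[keep_idx]))  # expected: 3 -> 2 (the paraphrased row is dropped)
```
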
@@ -14,7 +32,6 @@ class EventDetector:
     def __init__(self):
         self.model_name = "google/mt5-small"
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        # Don't initialize models in __init__
         self.model = None
         self.finbert = None
         self.roberta = None
@@ -22,7 +39,6 @@ class EventDetector:
 
     @spaces.GPU
     def initialize_models(self):
-        """Initialize all models with GPU support"""
         try:
             device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Initializing models on device: {device}")
@@ -43,12 +59,14 @@ class EventDetector:
             return "Нет", "Invalid input"
 
         try:
-            # Initialize models if needed
             if self.model is None:
                 if not self.initialize_models():
                     return "Нет", "Model initialization failed"
 
             device = "cuda" if torch.cuda.is_available() else "cpu"
+            # Truncate input text to avoid tensor size mismatch
+            text = text[:500]  # Adjust this value if needed
+
             prompt = f"""<s>Analyze the following news about {entity}:
             Text: {text}
             Task: Identify the main event type and provide a brief summary.</s>"""
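
The generate() call that consumes this prompt sits outside the hunk, so as orientation only, here is a rough sketch of how a character-truncated prompt is typically fed to the mt5 seq2seq model that `initialize_models` loads; the decoding settings (tokenizer `max_length`, `max_new_tokens`) are assumptions, not values taken from app.py.

```python
# Sketch only: decoding settings below are assumptions, not taken from app.py.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small").to(device)

entity = "Компания X"
text = "Длинный текст новости ..."[:500]  # character-level cap, as in the diff above

prompt = f"""<s>Analyze the following news about {entity}:
Text: {text}
Task: Identify the main event type and provide a brief summary.</s>"""

# Tokenizer-level truncation guards against over-long inputs even after the
# character slice above.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```
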
@@ -76,21 +94,30 @@ class EventDetector:
     @spaces.GPU
     def analyze_sentiment(self, text):
         try:
-            # Initialize models if needed
             if self.finbert is None:
                 if not self.initialize_models():
                     return "Neutral"
 
-            results = []
-            texts = [text[:512]]  # Truncate to avoid token length issues
+            # Truncate text to avoid tensor size issues
+            truncated_text = text[:500]
 
-            for model in [self.finbert, self.roberta, self.finbert_tone]:
-                try:
-                    result = model(texts)[0]
-                    results.append(self._get_sentiment(result))
-                except Exception as e:
-                    logger.error(f"Model inference error: {e}")
-                    results.append("Neutral")
+            results = []
+            try:
+                # Process text with all models in a batch
+                inputs = [truncated_text]
+                finbert_result = self.finbert(inputs, truncation=True, max_length=512)[0]
+                roberta_result = self.roberta(inputs, truncation=True, max_length=512)[0]
+                finbert_tone_result = self.finbert_tone(inputs, truncation=True, max_length=512)[0]
+
+                results = [
+                    self._get_sentiment(finbert_result),
+                    self._get_sentiment(roberta_result),
+                    self._get_sentiment(finbert_tone_result)
+                ]
+
+            except Exception as e:
+                logger.error(f"Model inference error: {e}")
+                return "Neutral"
 
             sentiment_counts = pd.Series(results).value_counts()
             return sentiment_counts.index[0] if sentiment_counts.iloc[0] >= 2 else "Neutral"
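
The batched path above still routes every pipeline output through `self._get_sentiment` and then takes a two-of-three vote via `pd.Series.value_counts`. A minimal standalone sketch of that mapping and vote follows; the label sets come from the pre-existing `_get_sentiment` helper, while the example pipeline outputs are invented.

```python
import pandas as pd

def get_sentiment(result: dict) -> str:
    # Map raw pipeline labels to a shared vocabulary (same label sets as the
    # original _get_sentiment helper).
    label = result["label"].lower()
    if label in ("positive", "label_2", "pos"):
        return "Positive"
    if label in ("negative", "label_0", "neg"):
        return "Negative"
    return "Neutral"

# Invented pipeline outputs for illustration: FinBERT, RoBERTa, FinBERT-tone.
raw = [{"label": "positive", "score": 0.91},
       {"label": "LABEL_2", "score": 0.77},
       {"label": "neutral", "score": 0.64}]

votes = [get_sentiment(r) for r in raw]
counts = pd.Series(votes).value_counts()
# At least two models must agree, otherwise fall back to "Neutral".
final = counts.index[0] if counts.iloc[0] >= 2 else "Neutral"
print(votes, "->", final)  # ['Positive', 'Positive', 'Neutral'] -> Positive
```
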
@@ -99,14 +126,6 @@ class EventDetector:
             logger.error(f"Sentiment analysis error: {e}")
             return "Neutral"
 
-    def _get_sentiment(self, result):
-        label = result['label'].lower()
-        if label in ["positive", "label_2", "pos"]:
-            return "Positive"
-        elif label in ["negative", "label_0", "neg"]:
-            return "Negative"
-        return "Neutral"
-
 def create_visualizations(df):
     if df is None or df.empty:
         return None, None
@@ -141,10 +160,19 @@ def process_file(file_obj):
         df = pd.read_excel(file_obj, sheet_name='Публикации')
         logger.info(f"Successfully read Excel file. Shape: {df.shape}")
 
+        # Perform deduplication
+        original_count = len(df)
+        df = fuzzy_deduplicate(df, 'Выдержки из текста', threshold=55)
+        logger.info(f"Removed {original_count - len(df)} duplicate entries")
+
         detector = EventDetector()
         processed_rows = []
         total = len(df)
 
+        # Initialize models once for all rows
+        if not detector.initialize_models():
+            raise Exception("Failed to initialize models")
+
         for idx, row in df.iterrows():
             try:
                 text = str(row.get('Выдержки из текста', ''))
@@ -164,7 +192,7 @@ def process_file(file_obj):
                     'Sentiment': sentiment,
                     'Event_Type': event_type,
                     'Event_Summary': event_summary,
-                    'Текст': text
+                    'Текст': text[:1000]  # Truncate text for display
                 })
 
                 if idx % 5 == 0:
@@ -185,7 +213,7 @@ def process_file(file_obj):
 
 def create_interface():
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# AI-анализ мониторинга новостей v.1.10")
+        gr.Markdown("# AI-анализ мониторинга новостей v.1.11")
 
         with gr.Row():
             file_input = gr.File(
 