import os
from newsapi import NewsApiClient
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime, timedelta
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import re

# Load variables from a .env file into the process environment (python-dotenv)
# before NewsChecker reads NEWS_API_KEY through os.getenv.
load_dotenv()

class NewsChecker:
    """Flag recent negative news that is semantically close to a marketing text.

    The checker extracts key terms from the marketing text with spaCy, searches
    NewsAPI for recent articles mentioning those terms, and keeps the articles
    that are both similar to the marketing text (cosine similarity of
    mean-pooled DistilBERT embeddings) and classified as negative by the
    sentiment pipeline.

    Initialisation is best-effort: failures are printed, not raised, and every
    attribute that could not be created is left as ``None``.
    """

    # Tunable thresholds; the values are the ones the checker has always used.
    MAX_TERM_WORDS = 3          # noun chunks longer than this are not key terms
    MIN_TERM_LENGTH = 3         # cleaned terms shorter than this are dropped
    MAX_QUERY_TERMS = 5         # number of key terms OR-ed into the search query
    LOOKBACK_DAYS = 7           # how far back the news search reaches
    PAGE_SIZE = 50              # articles requested from NewsAPI
    SIMILARITY_THRESHOLD = 0.6  # minimum cosine similarity to keep an article
    NEGATIVE_CONFIDENCE = 0.7   # minimum score for a NEGATIVE label to count
    TOP_ARTICLES = 3            # articles listed in the warning message

    # Generic marketing words that make useless search terms.
    _GENERIC_TERMS = frozenset(
        {'introduction', 'collection', 'products', 'items', 'things'}
    )

    def __init__(self):
        """Read the API key and load the NewsAPI client and NLP models."""
        self.api_key = os.getenv('NEWS_API_KEY')
        if not self.api_key:
            print("WARNING: NEWS_API_KEY not found in environment variables")
        else:
            print("NEWS_API_KEY found in environment variables")

        # Define every attribute up front so that a failed initialisation
        # leaves None placeholders instead of missing attributes.
        self.newsapi = None
        self.sentiment_analyzer = None
        self.tokenizer = None
        self.model = None
        self.nlp = None

        try:
            self.newsapi = NewsApiClient(api_key=self.api_key)
            # Initialize sentiment analyzer
            self.sentiment_analyzer = pipeline(
                'sentiment-analysis',
                model='distilbert-base-uncased-finetuned-sst-2-english',
            )
            # Initialize semantic similarity model
            self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
            self.model = AutoModel.from_pretrained('distilbert-base-uncased')
            # Load spaCy model for keyword extraction
            self.nlp = spacy.load('en_core_web_sm')
            print("Models initialized successfully")
        except Exception as e:
            # Best-effort start-up: report the problem and keep the object usable.
            print(f"Error initializing clients: {str(e)}")

    def extract_key_terms(self, text):
        """Extract key product and topic terms from the text.

        Short noun chunks are collected first, then nouns and proper nouns
        whose words are not already part of a collected chunk.

        Args:
            text: The marketing text to analyse.

        Returns:
            A list of unique, lower-cased, cleaned terms in order of first
            appearance. Empty when ``text`` is empty or the spaCy model is
            not loaded.
        """
        if not text or self.nlp is None:
            return []

        doc = self.nlp(text)

        key_terms = []
        covered_words = set()  # individual words already present in key_terms

        # Noun phrases of at most MAX_TERM_WORDS words.
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) <= self.MAX_TERM_WORDS:
                phrase = chunk.text.lower()
                key_terms.append(phrase)
                covered_words.update(re.findall(r'\w+', phrase))

        # Stand-alone nouns / proper nouns. Compare whole words rather than
        # substrings so that e.g. "art" is not dropped because of "smart watch".
        for token in doc:
            if token.pos_ not in ('NOUN', 'PROPN'):
                continue
            word = token.text.lower()
            parts = set(re.findall(r'\w+', word))
            if parts <= covered_words:
                continue
            key_terms.append(word)
            covered_words.update(parts)

        cleaned_terms = []
        for term in key_terms:
            # Strip everything except word characters, whitespace and hyphens.
            cleaned = re.sub(r'[^\w\s-]', '', term).strip()
            if len(cleaned) >= self.MIN_TERM_LENGTH and cleaned not in self._GENERIC_TERMS:
                cleaned_terms.append(cleaned)

        # dict.fromkeys de-duplicates while keeping first-appearance order, so
        # the "top" terms used for the search query are stable between runs.
        return list(dict.fromkeys(cleaned_terms))

    def get_embedding(self, text):
        """Get an embedding for a text using DistilBERT.

        Args:
            text: The text to embed; it is truncated to 512 tokens.

        Returns:
            The mean of the model's last hidden state over dimension 1
            (presumably shape ``(1, hidden_size)`` for one string), or
            ``None`` when tokenisation or the forward pass fails.
        """
        try:
            inputs = self.tokenizer(
                text, padding=True, truncation=True, max_length=512, return_tensors="pt"
            )

            # Inference only: no gradients needed.
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Mean-pool the token vectors into one sentence vector.
            return outputs.last_hidden_state.mean(dim=1)
        except Exception as e:
            print(f"Error getting embedding: {str(e)}")
            return None

    def calculate_similarity(self, text1_embedding, text2_embedding):
        """Calculate cosine similarity between two embeddings.

        Args:
            text1_embedding: Tensor as returned by ``get_embedding``.
            text2_embedding: Tensor as returned by ``get_embedding``.

        Returns:
            The cosine similarity as a float, or ``0.0`` if it cannot be
            computed (the error is printed).
        """
        try:
            # detach()/cpu() make the conversion safe for tensors that track
            # gradients or live on another device.
            emb1 = text1_embedding.detach().cpu().numpy().reshape(1, -1)
            emb2 = text2_embedding.detach().cpu().numpy().reshape(1, -1)
            return float(cosine_similarity(emb1, emb2)[0][0])
        except Exception as e:
            print(f"Error calculating similarity: {str(e)}")
            return 0.0

    def is_negative_news(self, title, description):
        """Check if the news article has negative sentiment.

        Args:
            title: Article title (may be empty or ``None``).
            description: Article description (may be empty or ``None``).

        Returns:
            ``True`` when the classifier labels the combined text NEGATIVE
            with a score above ``NEGATIVE_CONFIDENCE``; ``False`` otherwise,
            including when there is no text or the classifier fails.
        """
        try:
            # Combine title and description for better context, skipping
            # missing parts so the literal text "None" is never classified.
            text = ' '.join(part for part in (title, description) if part)
            if not text:
                return False

            # truncation=True keeps over-long articles within the model's
            # input limit instead of failing and being reported as not negative.
            result = self.sentiment_analyzer(text, truncation=True)[0]
            return result['label'] == 'NEGATIVE' and result['score'] > self.NEGATIVE_CONFIDENCE
        except Exception as e:
            print(f"Error in sentiment analysis: {str(e)}")
            return False

    def get_recent_news(self, marketing_text):
        """Fetch recent negative news related to the marketing text.

        Args:
            marketing_text: The marketing copy to check.

        Returns:
            A DataFrame with ``title``, ``description`` and ``similarity``
            columns, sorted by descending similarity. It is empty when no
            API key or client is available, no key terms are found, the
            request fails, or nothing relevant is found.
        """
        if not self.api_key:
            print("Cannot fetch news: No API key configured")
            return pd.DataFrame()

        if self.newsapi is None:
            print("Cannot fetch news: News API client is not initialized")
            return pd.DataFrame()

        try:
            key_terms = self.extract_key_terms(marketing_text)
            if not key_terms:
                return pd.DataFrame()

            # OR together the first few key terms as exact phrases.
            search_query = ' OR '.join(
                f'"{term}"' for term in key_terms[:self.MAX_QUERY_TERMS]
            )
            print(f"Searching news with query: {search_query}")

            week_ago = (
                datetime.now() - timedelta(days=self.LOOKBACK_DAYS)
            ).strftime('%Y-%m-%d')

            marketing_embedding = self.get_embedding(marketing_text)
            if marketing_embedding is None:
                return pd.DataFrame()

            response = self.newsapi.get_everything(
                q=search_query,
                from_param=week_ago,
                language='en',
                sort_by='relevancy',
                page_size=self.PAGE_SIZE,
            )
            if response.get('status') != 'ok':
                return pd.DataFrame()

            relevant_news = []
            for article in response.get('articles') or []:
                title = article.get('title')
                description = article.get('description')
                if not title or not description:
                    continue

                # The article must literally mention one of the key terms.
                article_text = f"{title.lower()} {description.lower()}"
                if not any(term in article_text for term in key_terms):
                    continue

                article_embedding = self.get_embedding(article_text)
                if article_embedding is None:
                    continue

                similarity = self.calculate_similarity(marketing_embedding, article_embedding)

                # Keep only articles that are both similar and negative; the
                # cheaper threshold test runs before the sentiment model.
                if similarity > self.SIMILARITY_THRESHOLD and self.is_negative_news(title, description):
                    relevant_news.append({
                        'title': title,
                        'description': description,
                        'similarity': similarity,
                    })

            relevant_news.sort(key=lambda item: item['similarity'], reverse=True)
            return pd.DataFrame(relevant_news)

        except Exception as e:
            # Network and API errors must not break the caller's check.
            print(f"Error fetching news: {str(e)}")
            return pd.DataFrame()

    def check_content_against_news(self, marketing_text):
        """Check marketing text against recent negative news.

        Args:
            marketing_text: The marketing copy to check.

        Returns:
            A dict with ``status`` (``'pass'`` or ``'warning'``) and a
            human-readable ``message``. A warning lists up to
            ``TOP_ARTICLES`` of the most similar negative articles.
        """
        news_df = self.get_recent_news(marketing_text)

        if news_df.empty:
            return {
                'status': 'pass',
                'message': 'No relevant negative news found.'
            }

        # get_recent_news returns rows sorted by similarity, so head() yields
        # the most similar ones; it is non-empty because news_df is.
        top_news = news_df.head(self.TOP_ARTICLES)
        lines = [
            f"- {title} (Similarity: {similarity:.2f})\n"
            for title, similarity in zip(top_news['title'], top_news['similarity'])
        ]
        message = (
            'Found semantically relevant negative news that might impact your marketing:\n'
            + ''.join(lines)
        )
        return {
            'status': 'warning',
            'message': message
        }