"""Check marketing copy against recent negative news coverage."""

import os
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import spacy
import torch
from dotenv import load_dotenv
from newsapi import NewsApiClient
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer, pipeline

load_dotenv()


class NewsChecker:
    """Flag recent negative news that is semantically close to a marketing text.

    The checker extracts key terms from the marketing text with spaCy, searches
    NewsAPI for articles from the last 7 days, and keeps the articles that are
    both similar to the text (cosine similarity of mean-pooled DistilBERT
    embeddings) and classified as negative by a sentiment pipeline.
    """

    # Generic marketing nouns that make poor search terms.
    _STOP_TERMS = frozenset(
        {'introduction', 'collection', 'products', 'items', 'things'}
    )

    def __init__(self):
        """Read the API key from the environment and load all clients/models.

        Initialization failures are reported on stdout instead of raised; the
        affected attributes then stay ``None``.
        """
        self.api_key = os.getenv('NEWS_API_KEY')
        if not self.api_key:
            print("WARNING: NEWS_API_KEY not found in environment variables")
        else:
            print("NEWS_API_KEY found in environment variables")

        # Pre-declare every client so a failure part-way through the block
        # below leaves well-defined ``None`` attributes rather than missing ones.
        self.newsapi = None
        self.sentiment_analyzer = None
        self.tokenizer = None
        self.model = None
        self.nlp = None

        try:
            self.newsapi = NewsApiClient(api_key=self.api_key)

            # Initialize sentiment analyzer
            self.sentiment_analyzer = pipeline(
                'sentiment-analysis',
                model='distilbert-base-uncased-finetuned-sst-2-english',
            )

            # Initialize semantic similarity model
            self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
            self.model = AutoModel.from_pretrained('distilbert-base-uncased')

            # Load spaCy model for keyword extraction
            self.nlp = spacy.load('en_core_web_sm')

            print("Models initialized successfully")
        except Exception as e:
            print(f"Error initializing clients: {str(e)}")

    def extract_key_terms(self, text):
        """Extract key product and topic terms from the text.

        Collects lower-cased noun chunks of at most three words, then any
        NOUN/PROPN token that is not already a substring of a collected term.
        Terms are stripped of punctuation, and terms of two characters or fewer
        as well as generic marketing words are dropped.

        Returns:
            A list of unique terms in order of first appearance, so slicing
            the result (e.g. ``[:5]``) is deterministic across runs.
        """
        doc = self.nlp(text)

        key_terms = []

        # Noun phrases, limited to 3 words or less.
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) <= 3:
                key_terms.append(chunk.text.lower())

        # Stand-alone nouns / proper nouns not already covered by a phrase.
        for token in doc:
            if token.pos_ not in ('NOUN', 'PROPN'):
                continue
            word = token.text.lower()
            if not any(word in term for term in key_terms):
                key_terms.append(word)

        cleaned_terms = []
        for term in key_terms:
            if term in self._STOP_TERMS:
                continue
            cleaned = re.sub(r'[^\w\s-]', '', term).strip()
            if len(cleaned) > 2:  # Only keep terms longer than 2 characters
                cleaned_terms.append(cleaned)

        # dict.fromkeys de-duplicates while preserving insertion order; a set
        # would return the terms in an arbitrary, run-dependent order.
        return list(dict.fromkeys(cleaned_terms))

    def get_embedding(self, text):
        """Get an embedding for a text using DistilBERT.

        Returns:
            The mean of the model's last hidden state over the token axis, or
            ``None`` if tokenization or the forward pass fails.
        """
        try:
            inputs = self.tokenizer(
                text,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )

            # Inference only: no gradients needed.
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Use the mean of the last hidden state as the sentence embedding
            return outputs.last_hidden_state.mean(dim=1)
        except Exception as e:
            print(f"Error getting embedding: {str(e)}")
            return None

    def calculate_similarity(self, text1_embedding, text2_embedding):
        """Calculate cosine similarity between two embeddings.

        Returns:
            The similarity as a plain ``float``, or ``0.0`` on any error.
        """
        try:
            # detach()/cpu() make the conversion safe for tensors that carry
            # gradients or live on an accelerator; both are no-ops otherwise.
            emb1 = text1_embedding.detach().cpu().numpy().reshape(1, -1)
            emb2 = text2_embedding.detach().cpu().numpy().reshape(1, -1)

            return float(cosine_similarity(emb1, emb2)[0][0])
        except Exception as e:
            print(f"Error calculating similarity: {str(e)}")
            return 0.0

    def is_negative_news(self, title, description):
        """Check if the news article has negative sentiment.

        Returns:
            ``True`` when the classifier labels the combined title and
            description ``NEGATIVE`` with a score above 0.7; ``False``
            otherwise, including when the analysis itself fails.
        """
        try:
            # Combine title and description for better context
            text = f"{title} {description}"
            # truncation=True keeps over-long inputs within the model's
            # maximum sequence length instead of raising (which the handler
            # below would silently turn into "not negative").
            result = self.sentiment_analyzer(text, truncation=True)[0]

            # Return True if sentiment is negative with high confidence
            return result['label'] == 'NEGATIVE' and result['score'] > 0.7
        except Exception as e:
            print(f"Error in sentiment analysis: {str(e)}")
            return False

    def get_recent_news(self, marketing_text):
        """Fetch recent negative news that is relevant to the marketing text.

        Returns:
            A DataFrame with ``title``, ``description`` and ``similarity``
            columns, sorted by descending similarity. An empty DataFrame is
            returned when no API key is configured, the clients failed to
            initialize, no key terms are found, nothing matches, or any error
            occurs.
        """
        if not self.api_key:
            print("Cannot fetch news: No API key configured")
            return pd.DataFrame()

        required = (
            self.newsapi,
            self.nlp,
            self.tokenizer,
            self.model,
            self.sentiment_analyzer,
        )
        if any(component is None for component in required):
            print("Cannot fetch news: clients or models failed to initialize")
            return pd.DataFrame()

        try:
            # Extract key terms from marketing text
            key_terms = self.extract_key_terms(marketing_text)
            if not key_terms:
                return pd.DataFrame()

            # Create search query from the first 5 key terms
            search_query = ' OR '.join(f'"{term}"' for term in key_terms[:5])
            print(f"Searching news with query: {search_query}")

            # Get news from the last 7 days
            week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')

            # Get embedding for marketing text
            marketing_embedding = self.get_embedding(marketing_text)
            if marketing_embedding is None:
                return pd.DataFrame()

            # Search news with the extracted terms
            response = self.newsapi.get_everything(
                q=search_query,
                from_param=week_ago,
                language='en',
                sort_by='relevancy',
                page_size=50,
            )

            if response.get('status') != 'ok':
                return pd.DataFrame()

            relevant_news = []
            for article in response.get('articles') or []:
                title = article.get('title')
                description = article.get('description')
                if not title or not description:
                    continue

                # Check if the article contains any of our key terms
                # (key terms are already lower-cased).
                article_text = f"{title.lower()} {description.lower()}"
                if not any(term in article_text for term in key_terms):
                    continue

                article_embedding = self.get_embedding(article_text)
                if article_embedding is None:
                    continue

                similarity = self.calculate_similarity(
                    marketing_embedding, article_embedding
                )

                # Keep only articles that are both semantically similar and negative
                if similarity > 0.6 and self.is_negative_news(title, description):
                    relevant_news.append({
                        'title': title,
                        'description': description,
                        'similarity': similarity,
                    })

            # Sort by similarity and convert to DataFrame
            relevant_news.sort(key=lambda item: item['similarity'], reverse=True)
            return pd.DataFrame(relevant_news)

        except Exception as e:
            print(f"Error fetching news: {str(e)}")
            return pd.DataFrame()

    def check_content_against_news(self, marketing_text):
        """Summarize whether negative news may affect the marketing text.

        Returns:
            A dict with ``status`` (``'pass'`` or ``'warning'``) and a
            human-readable ``message``. A warning lists the top 3 most similar
            negative articles with their similarity scores.
        """
        news_df = self.get_recent_news(marketing_text)

        if news_df.empty:
            return {
                'status': 'pass',
                'message': 'No relevant negative news found.',
            }

        # news_df is non-empty here, so head(3) always yields at least one row.
        top_news = news_df.head(3)
        entries = ''.join(
            f"- {row['title']} (Similarity: {row['similarity']:.2f})\n"
            for _, row in top_news.iterrows()
        )
        return {
            'status': 'warning',
            'message': (
                'Found semantically relevant negative news that might impact '
                'your marketing:\n' + entries
            ),
        }