Spaces:

ozgurunlu
/

m-check

Sleeping

App Files Files Community

Ozgur Unlu committed on Nov 6, 2024

Commit

f73d159

1 Parent(s): 800031b

better news search test

Browse files

Files changed (2) hide show

news_checker.py +128 -33
requirements.txt +1 -2

news_checker.py CHANGED Viewed

@@ -3,6 +3,11 @@ from newsapi import NewsApiClient
 from dotenv import load_dotenv
 import pandas as pd
 from datetime import datetime, timedelta
 load_dotenv()
@@ -16,10 +21,66 @@ class NewsChecker:
         try:
             self.newsapi = NewsApiClient(api_key=self.api_key)
         except Exception as e:
             print(f"Error initializing NewsAPI client: {str(e)}")
-    def get_recent_news(self):
         if not self.api_key:
             print("Cannot fetch news: No API key configured")
             return pd.DataFrame()
@@ -27,60 +88,94 @@ class NewsChecker:
         try:
             # Get news from the last 7 days
             week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
-            response = self.newsapi.get_everything(
-                q='',
-                from_param=week_ago,
                 language='en',
-                sort_by='relevancy',
-                page_size=100
             )
-            if response['status'] == 'ok':
-                articles = response['articles']
-                # Extract titles and descriptions
-                news_data = [
-                    {
-                        'title': article['title'],
-                        'description': article['description']
-                    }
-                    for article in articles if article['description']
-                ]
-                print(f"Successfully fetched {len(news_data)} articles")
-                return pd.DataFrame(news_data)
-            else:
-                print(f"NewsAPI response status was not 'ok': {response.get('status')}")
-                return pd.DataFrame()
         except Exception as e:
             print(f"Error fetching news: {str(e)}")
             return pd.DataFrame()
     def check_content_against_news(self, marketing_text):
-        news_df = self.get_recent_news()
         if news_df.empty:
             return {
                 'status': 'warning',
                 'message': 'Unable to check against current news context. Proceed with caution.'
             }
-        # Simple keyword matching for demo purposes
-        marketing_words = set(marketing_text.lower().split())
         potential_conflicts = []
         for _, row in news_df.iterrows():
-            title_words = set(row['title'].lower().split())
-            desc_words = set(str(row['description']).lower().split())
-            # Check for significant word overlap
-            if len(marketing_words.intersection(title_words)) >= 3:
                 potential_conflicts.append(row['title'])
         if potential_conflicts:
             return {
                 'status': 'warning',
-                'message': 'Potential conflicts found with current news:\n- ' + '\n- '.join(potential_conflicts)
             }
         return {
             'status': 'pass',
             'message': 'No significant conflicts with current news found.'

 from dotenv import load_dotenv
 import pandas as pd
 from datetime import datetime, timedelta
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.tag import pos_tag
+from nltk.chunk import ne_chunk
+from collections import Counter
 load_dotenv()
         try:
             self.newsapi = NewsApiClient(api_key=self.api_key)
+            # Download required NLTK data
+            nltk.download('punkt', quiet=True)
+            nltk.download('averaged_perceptron_tagger', quiet=True)
+            nltk.download('maxent_ne_chunker', quiet=True)
+            nltk.download('words', quiet=True)
         except Exception as e:
             print(f"Error initializing NewsAPI client: {str(e)}")
+    def extract_keywords(self, text, max_keywords=3):
+        """Build a news search query from the most frequent keywords in ``text``.
+
+        Named entities reported by ``ne_chunk`` and nouns/adjectives (POS tags
+        starting with ``NN``/``JJ``) that are longer than two characters and
+        not in the stop list are counted together. The ``max_keywords`` most
+        frequent ones are double-quoted and joined with ``OR``.
+
+        Args:
+            text: Text to analyse.
+            max_keywords: Maximum number of keywords placed in the query.
+
+        Returns:
+            The query string, or ``None`` when no keyword is found or any
+            step raises (the error is printed, not propagated).
+        """
+        try:
+            # Tokenize and tag parts of speech
+            tokens = word_tokenize(text)
+            tagged = pos_tag(tokens)
+            # Extract named entities: only chunks exposing a ``label``
+            # attribute are treated as entities; their words are re-joined.
+            named_entities = []
+            for chunk in ne_chunk(tagged):
+                if hasattr(chunk, 'label'):
+                    named_entities.append(' '.join(c[0] for c in chunk))
+            # Extract nouns and adjectives (excluding common words)
+            common_words = {'new', 'great', 'good', 'best', 'better', 'more', 'most',
+                          'today', 'now', 'get', 'our', 'your', 'their', 'this', 'that',
+                          'these', 'those', 'here', 'there', 'when', 'where', 'who',
+                          'what', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
+                          'other', 'some', 'such', 'only', 'own',
+                          'same', 'than', 'too', 'very', 'can', 'will', 'just', 'should',
+                          'features', 'feature', 'offers', 'offer', 'price', 'prices'}
+            important_words = []
+            for word, tag in tagged:
+                # NN* for nouns, JJ* for adjectives
+                if tag.startswith(('NN', 'JJ')) and \
+                   word.lower() not in common_words and \
+                   len(word) > 2:
+                    important_words.append(word.lower())
+            # Count case-insensitively: entities keep their original casing
+            # while important words are lower-cased, so without folding the
+            # same term could take two slots (e.g. '"Apple" OR "apple"').
+            # The first-seen spelling is the one emitted in the query.
+            keyword_freq = Counter()
+            display_form = {}
+            for keyword in named_entities + important_words:
+                key = keyword.lower()
+                keyword_freq[key] += 1
+                display_form.setdefault(key, keyword)
+            # Get most common keywords
+            main_keywords = [display_form[key] for key, _ in keyword_freq.most_common(max_keywords)]
+            # If no keywords found, return None to trigger general news search
+            if not main_keywords:
+                return None
+            # Create search query
+            search_query = ' OR '.join(f'"{kw}"' for kw in main_keywords)
+            print(f"Generated search query: {search_query}")
+            return search_query
+        except Exception as e:
+            print(f"Error in keyword extraction: {str(e)}")
+            return None
+    def get_recent_news(self, search_query=None):
+        """Fetch recent English-language articles as a title/description table.
+
+        When an API key is configured, requests the top headlines (page size
+        10) and, if ``search_query`` is truthy, also runs an "everything"
+        search from 7 days ago onward (page size 15, sorted by relevancy).
+        Articles lacking a title or a description, and repeated titles, are
+        dropped.
+
+        Args:
+            search_query: Optional query string passed as ``q`` to the
+                "everything" search.
+
+        Returns:
+            A DataFrame built from ``title``/``description`` records; an empty
+            DataFrame when no API key is configured or when any step raises
+            (the error is printed, not propagated).
+        """
         if not self.api_key:
             print("Cannot fetch news: No API key configured")
             return pd.DataFrame()
         try:
             # Get news from the last 7 days
             week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
+            articles = []
+            # Get top headlines first (major news events)
+            top_headlines = self.newsapi.get_top_headlines(
                 language='en',
+                page_size=10  # Limit to top 10 headlines
             )
+            if top_headlines['status'] == 'ok':
+                articles.extend(top_headlines['articles'])
+            # If we have specific keywords, search for related news
+            if search_query:
+                everything = self.newsapi.get_everything(
+                    q=search_query,
+                    from_param=week_ago,
+                    language='en',
+                    sort_by='relevancy',
+                    page_size=15  # More articles for specific searches
+                )
+                if everything['status'] == 'ok':
+                    articles.extend(everything['articles'])
+            # Extract and clean article data
+            news_data = []
+            seen_titles = set()  # To avoid duplicates
+            for article in articles:
+                # A key that is present with a None value bypasses .get()'s
+                # default, and None.strip() would raise and make the outer
+                # handler discard every article; coerce None to '' instead.
+                title = (article.get('title') or '').strip()
+                desc = (article.get('description') or '').strip()
+                # Skip articles without title or description
+                if not title or not desc:
+                    continue
+                # Skip duplicate titles
+                if title in seen_titles:
+                    continue
+                news_data.append({
+                    'title': title,
+                    'description': desc
+                })
+                seen_titles.add(title)
+            print(f"Successfully fetched {len(news_data)} unique articles")
+            return pd.DataFrame(news_data)
         except Exception as e:
             print(f"Error fetching news: {str(e)}")
             return pd.DataFrame()
     def check_content_against_news(self, marketing_text):
+        # Extract meaningful keywords from marketing text
+        search_query = self.extract_keywords(marketing_text)
+        print(f"Using search query: {search_query}")
+        # Get news articles
+        news_df = self.get_recent_news(search_query)
         if news_df.empty:
             return {
                 'status': 'warning',
                 'message': 'Unable to check against current news context. Proceed with caution.'
             }
+        # Prepare marketing text for comparison
+        marketing_words = set(word.lower() for word in word_tokenize(marketing_text))
         potential_conflicts = []
         for _, row in news_df.iterrows():
+            title_words = set(word.lower() for word in word_tokenize(row['title']))
+            desc_words = set(word.lower() for word in word_tokenize(str(row['description'])))
+            # Calculate overlap ratios
+            title_overlap = len(marketing_words.intersection(title_words)) / len(title_words)
+            desc_overlap = len(marketing_words.intersection(desc_words)) / len(desc_words)
+            # Flag if significant overlap found
+            if title_overlap > 0.3 or desc_overlap > 0.25:  # Adjusted thresholds
                 potential_conflicts.append(row['title'])
         if potential_conflicts:
             return {
                 'status': 'warning',
+                'message': 'Potential conflicts found with current news:\n- ' +
+                          '\n- '.join(potential_conflicts[:3]) +
+                          ('\n\nAnd more...' if len(potential_conflicts) > 3 else '')
             }
         return {
             'status': 'pass',
             'message': 'No significant conflicts with current news found.'

requirements.txt CHANGED Viewed

@@ -8,5 +8,4 @@ pandas==2.1.4
 numpy==1.24.3
 requests==2.31.0
 python-dotenv==1.0.0
-sentencepiece==0.2.0
-sacremoses==0.1.1

 numpy==1.24.3
 requests==2.31.0
 python-dotenv==1.0.0
+nltk==3.8.1