Ozgur Unlu committed on
Commit
f73d159
·
1 Parent(s): 800031b

better news search test

Browse files
Files changed (2) hide show
  1. news_checker.py +128 -33
  2. requirements.txt +1 -2
news_checker.py CHANGED
@@ -3,6 +3,11 @@ from newsapi import NewsApiClient
3
  from dotenv import load_dotenv
4
  import pandas as pd
5
  from datetime import datetime, timedelta
 
 
 
 
 
6
 
7
  load_dotenv()
8
 
@@ -16,10 +21,66 @@ class NewsChecker:
16
 
17
  try:
18
  self.newsapi = NewsApiClient(api_key=self.api_key)
 
 
 
 
 
19
  except Exception as e:
20
  print(f"Error initializing NewsAPI client: {str(e)}")
21
-
22
- def get_recent_news(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  if not self.api_key:
24
  print("Cannot fetch news: No API key configured")
25
  return pd.DataFrame()
@@ -27,60 +88,94 @@ class NewsChecker:
27
  try:
28
  # Get news from the last 7 days
29
  week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
30
- response = self.newsapi.get_everything(
31
- q='',
32
- from_param=week_ago,
 
33
  language='en',
34
- sort_by='relevancy',
35
- page_size=100
36
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- if response['status'] == 'ok':
39
- articles = response['articles']
40
- # Extract titles and descriptions
41
- news_data = [
42
- {
43
- 'title': article['title'],
44
- 'description': article['description']
45
- }
46
- for article in articles if article['description']
47
- ]
48
- print(f"Successfully fetched {len(news_data)} articles")
49
- return pd.DataFrame(news_data)
50
- else:
51
- print(f"NewsAPI response status was not 'ok': {response.get('status')}")
52
- return pd.DataFrame()
 
 
 
 
 
 
 
 
 
53
 
54
  except Exception as e:
55
  print(f"Error fetching news: {str(e)}")
56
  return pd.DataFrame()
57
 
58
  def check_content_against_news(self, marketing_text):
59
- news_df = self.get_recent_news()
 
 
 
 
 
60
  if news_df.empty:
61
  return {
62
  'status': 'warning',
63
  'message': 'Unable to check against current news context. Proceed with caution.'
64
  }
65
-
66
- # Simple keyword matching for demo purposes
67
- marketing_words = set(marketing_text.lower().split())
68
  potential_conflicts = []
69
 
70
  for _, row in news_df.iterrows():
71
- title_words = set(row['title'].lower().split())
72
- desc_words = set(str(row['description']).lower().split())
 
 
 
 
73
 
74
- # Check for significant word overlap
75
- if len(marketing_words.intersection(title_words)) >= 3:
76
  potential_conflicts.append(row['title'])
77
-
78
  if potential_conflicts:
79
  return {
80
  'status': 'warning',
81
- 'message': 'Potential conflicts found with current news:\n- ' + '\n- '.join(potential_conflicts)
 
 
82
  }
83
-
84
  return {
85
  'status': 'pass',
86
  'message': 'No significant conflicts with current news found.'
 
3
  from dotenv import load_dotenv
4
  import pandas as pd
5
  from datetime import datetime, timedelta
6
+ import nltk
7
+ from nltk.tokenize import word_tokenize
8
+ from nltk.tag import pos_tag
9
+ from nltk.chunk import ne_chunk
10
+ from collections import Counter
11
 
12
  load_dotenv()
13
 
 
21
 
22
  try:
23
  self.newsapi = NewsApiClient(api_key=self.api_key)
24
+ # Download required NLTK data
25
+ nltk.download('punkt', quiet=True)
26
+ nltk.download('averaged_perceptron_tagger', quiet=True)
27
+ nltk.download('maxent_ne_chunker', quiet=True)
28
+ nltk.download('words', quiet=True)
29
  except Exception as e:
30
  print(f"Error initializing NewsAPI client: {str(e)}")
31
+
32
def extract_keywords(self, text, max_keywords=3):
    """Build a news-search query from the most frequent keywords in *text*.

    Candidate keywords are the named entities found by ``ne_chunk`` plus
    every noun/adjective token (POS tag starting with ``NN`` or ``JJ``)
    that is longer than two characters and not in a small stop list of
    generic marketing words.

    Candidates are counted case-insensitively, so an entity such as
    "Apple" and the lowercased noun token "apple" count as one keyword
    instead of occupying two slots of the query. The first-seen surface
    form is the one used in the query.

    Args:
        text: Marketing copy to analyse.
        max_keywords: Maximum number of keywords to put in the query.

    Returns:
        A string of double-quoted keywords joined with `` OR ``, or
        ``None`` when no keyword is found or extraction fails (callers
        treat ``None`` as "no keyword search").
    """
    try:
        # Nothing to analyse: skip the NLP pipeline entirely.
        if not text or not text.strip():
            return None

        # Tokenize and tag parts of speech
        tagged = pos_tag(word_tokenize(text))

        # Named entities come first so they win frequency ties below
        # (Counter.most_common keeps first-encountered order for ties).
        candidates = []
        for chunk in ne_chunk(tagged):
            # Only subtree chunks expose ``label``; plain (word, tag)
            # tuples do not.
            if hasattr(chunk, 'label'):
                candidates.append(' '.join(token for token, _tag in chunk))

        # Generic words that would make the news search too broad
        common_words = {'new', 'great', 'good', 'best', 'better', 'more', 'most',
                        'today', 'now', 'get', 'our', 'your', 'their', 'this', 'that',
                        'these', 'those', 'here', 'there', 'when', 'where', 'who',
                        'what', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
                        'other', 'some', 'such', 'only', 'own',
                        'same', 'than', 'too', 'very', 'can', 'will', 'just', 'should',
                        'features', 'feature', 'offers', 'offer', 'price', 'prices'}

        # NN* for nouns, JJ* for adjectives
        for word, tag in tagged:
            lowered = word.lower()
            if (tag.startswith(('NN', 'JJ'))
                    and len(word) > 2
                    and lowered not in common_words):
                candidates.append(lowered)

        # Count case-insensitively, remembering the first surface form
        keyword_freq = Counter()
        surface_form = {}
        for candidate in candidates:
            key = candidate.lower()
            keyword_freq[key] += 1
            surface_form.setdefault(key, candidate)

        main_keywords = [surface_form[key]
                         for key, _count in keyword_freq.most_common(max_keywords)]

        # If no keywords found, return None to trigger general news search
        if not main_keywords:
            return None

        # Create search query
        search_query = ' OR '.join(f'"{kw}"' for kw in main_keywords)
        print(f"Generated search query: {search_query}")
        return search_query

    except Exception as e:
        # Best effort: any tokenizer/tagger failure degrades to "no query".
        print(f"Error in keyword extraction: {str(e)}")
        return None
82
+
83
def get_recent_news(self, search_query=None):
    """Fetch recent English-language news as a DataFrame of unique articles.

    Always requests the top 10 headlines; when *search_query* is given,
    additionally requests up to 15 articles from the last 7 days that
    match it, sorted by relevancy. Articles without a title or a
    description are dropped, and only the first article for each title
    is kept.

    Args:
        search_query: Optional query string passed as ``q`` to the
            "everything" search. ``None`` or empty skips that search.

    Returns:
        A ``pandas.DataFrame`` with ``title`` and ``description``
        columns, or an empty DataFrame when no API key is configured,
        nothing usable was returned, or any error occurred.
    """
    if not self.api_key:
        print("Cannot fetch news: No API key configured")
        return pd.DataFrame()

    try:
        # Get news from the last 7 days
        week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
        articles = []

        # Get top headlines first (major news events)
        top_headlines = self.newsapi.get_top_headlines(
            language='en',
            page_size=10,  # Limit to top 10 headlines
        )
        if top_headlines['status'] == 'ok':
            articles.extend(top_headlines['articles'])

        # If we have specific keywords, search for related news
        if search_query:
            everything = self.newsapi.get_everything(
                q=search_query,
                from_param=week_ago,
                language='en',
                sort_by='relevancy',
                page_size=15,  # More articles for specific searches
            )
            if everything['status'] == 'ok':
                articles.extend(everything['articles'])

        # Extract and clean article data
        news_data = []
        seen_titles = set()  # To avoid duplicates

        for article in articles:
            # ``dict.get(key, '')`` still returns None when the key is
            # present with a None value (JSON null), and ``None.strip()``
            # would raise and discard every article via the except below.
            title = (article.get('title') or '').strip()
            desc = (article.get('description') or '').strip()

            # Skip articles without title or description, and duplicates
            if not title or not desc or title in seen_titles:
                continue

            seen_titles.add(title)
            news_data.append({
                'title': title,
                'description': desc,
            })

        print(f"Successfully fetched {len(news_data)} unique articles")
        return pd.DataFrame(news_data)

    except Exception as e:
        # Best effort: callers treat an empty frame as "news unavailable".
        print(f"Error fetching news: {str(e)}")
        return pd.DataFrame()
141
 
142
  def check_content_against_news(self, marketing_text):
143
+ # Extract meaningful keywords from marketing text
144
+ search_query = self.extract_keywords(marketing_text)
145
+ print(f"Using search query: {search_query}")
146
+
147
+ # Get news articles
148
+ news_df = self.get_recent_news(search_query)
149
  if news_df.empty:
150
  return {
151
  'status': 'warning',
152
  'message': 'Unable to check against current news context. Proceed with caution.'
153
  }
154
+
155
+ # Prepare marketing text for comparison
156
+ marketing_words = set(word.lower() for word in word_tokenize(marketing_text))
157
  potential_conflicts = []
158
 
159
  for _, row in news_df.iterrows():
160
+ title_words = set(word.lower() for word in word_tokenize(row['title']))
161
+ desc_words = set(word.lower() for word in word_tokenize(str(row['description'])))
162
+
163
+ # Calculate overlap ratios
164
+ title_overlap = len(marketing_words.intersection(title_words)) / len(title_words)
165
+ desc_overlap = len(marketing_words.intersection(desc_words)) / len(desc_words)
166
 
167
+ # Flag if significant overlap found
168
+ if title_overlap > 0.3 or desc_overlap > 0.25: # Adjusted thresholds
169
  potential_conflicts.append(row['title'])
170
+
171
  if potential_conflicts:
172
  return {
173
  'status': 'warning',
174
+ 'message': 'Potential conflicts found with current news:\n- ' +
175
+ '\n- '.join(potential_conflicts[:3]) +
176
+ ('\n\nAnd more...' if len(potential_conflicts) > 3 else '')
177
  }
178
+
179
  return {
180
  'status': 'pass',
181
  'message': 'No significant conflicts with current news found.'
requirements.txt CHANGED
@@ -8,5 +8,4 @@ pandas==2.1.4
8
  numpy==1.24.3
9
  requests==2.31.0
10
  python-dotenv==1.0.0
11
- sentencepiece==0.2.0
12
- sacremoses==0.1.1
 
8
  numpy==1.24.3
9
  requests==2.31.0
10
  python-dotenv==1.0.0
11
+ nltk==3.8.1