Ozgur Unlu
committed on
Commit
·
f73d159
1
Parent(s):
800031b
better news search test
Browse files
- news_checker.py +128 -33
- requirements.txt +1 -2
news_checker.py
CHANGED
@@ -3,6 +3,11 @@ from newsapi import NewsApiClient
|
|
3 |
from dotenv import load_dotenv
|
4 |
import pandas as pd
|
5 |
from datetime import datetime, timedelta
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
load_dotenv()
|
8 |
|
@@ -16,10 +21,66 @@ class NewsChecker:
|
|
16 |
|
17 |
try:
|
18 |
self.newsapi = NewsApiClient(api_key=self.api_key)
|
|
|
|
|
|
|
|
|
|
|
19 |
except Exception as e:
|
20 |
print(f"Error initializing NewsAPI client: {str(e)}")
|
21 |
-
|
22 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
if not self.api_key:
|
24 |
print("Cannot fetch news: No API key configured")
|
25 |
return pd.DataFrame()
|
@@ -27,60 +88,94 @@ class NewsChecker:
|
|
27 |
try:
|
28 |
# Get news from the last 7 days
|
29 |
week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
33 |
language='en',
|
34 |
-
|
35 |
-
page_size=100
|
36 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
except Exception as e:
|
55 |
print(f"Error fetching news: {str(e)}")
|
56 |
return pd.DataFrame()
|
57 |
|
58 |
def check_content_against_news(self, marketing_text):
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
60 |
if news_df.empty:
|
61 |
return {
|
62 |
'status': 'warning',
|
63 |
'message': 'Unable to check against current news context. Proceed with caution.'
|
64 |
}
|
65 |
-
|
66 |
-
#
|
67 |
-
marketing_words = set(
|
68 |
potential_conflicts = []
|
69 |
|
70 |
for _, row in news_df.iterrows():
|
71 |
-
title_words = set(row['title']
|
72 |
-
desc_words = set(str(row['description'])
|
|
|
|
|
|
|
|
|
73 |
|
74 |
-
#
|
75 |
-
if
|
76 |
potential_conflicts.append(row['title'])
|
77 |
-
|
78 |
if potential_conflicts:
|
79 |
return {
|
80 |
'status': 'warning',
|
81 |
-
'message': 'Potential conflicts found with current news:\n- ' +
|
|
|
|
|
82 |
}
|
83 |
-
|
84 |
return {
|
85 |
'status': 'pass',
|
86 |
'message': 'No significant conflicts with current news found.'
|
|
|
3 |
from dotenv import load_dotenv
|
4 |
import pandas as pd
|
5 |
from datetime import datetime, timedelta
|
6 |
+
import nltk
|
7 |
+
from nltk.tokenize import word_tokenize
|
8 |
+
from nltk.tag import pos_tag
|
9 |
+
from nltk.chunk import ne_chunk
|
10 |
+
from collections import Counter
|
11 |
|
12 |
load_dotenv()
|
13 |
|
|
|
21 |
|
22 |
try:
|
23 |
self.newsapi = NewsApiClient(api_key=self.api_key)
|
24 |
+
# Download required NLTK data
|
25 |
+
nltk.download('punkt', quiet=True)
|
26 |
+
nltk.download('averaged_perceptron_tagger', quiet=True)
|
27 |
+
nltk.download('maxent_ne_chunker', quiet=True)
|
28 |
+
nltk.download('words', quiet=True)
|
29 |
except Exception as e:
|
30 |
print(f"Error initializing NewsAPI client: {str(e)}")
|
31 |
+
|
32 |
+
def extract_keywords(self, text, max_keywords=3):
|
33 |
+
"""Extract meaningful keywords from text using NLP techniques"""
|
34 |
+
try:
|
35 |
+
# Tokenize and tag parts of speech
|
36 |
+
tokens = word_tokenize(text)
|
37 |
+
tagged = pos_tag(tokens)
|
38 |
+
|
39 |
+
# Extract named entities
|
40 |
+
named_entities = []
|
41 |
+
chunks = ne_chunk(tagged)
|
42 |
+
for chunk in chunks:
|
43 |
+
if hasattr(chunk, 'label'):
|
44 |
+
named_entities.append(' '.join(c[0] for c in chunk))
|
45 |
+
|
46 |
+
# Extract nouns and adjectives (excluding common words)
|
47 |
+
common_words = {'new', 'great', 'good', 'best', 'better', 'more', 'most',
|
48 |
+
'today', 'now', 'get', 'our', 'your', 'their', 'this', 'that',
|
49 |
+
'these', 'those', 'here', 'there', 'when', 'where', 'who',
|
50 |
+
'what', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
|
51 |
+
'more', 'most', 'other', 'some', 'such', 'only', 'own',
|
52 |
+
'same', 'than', 'too', 'very', 'can', 'will', 'just', 'should',
|
53 |
+
'features', 'feature', 'offers', 'offer', 'price', 'prices'}
|
54 |
+
|
55 |
+
important_words = []
|
56 |
+
for word, tag in tagged:
|
57 |
+
# NN* for nouns, JJ* for adjectives
|
58 |
+
if (tag.startswith('NN') or tag.startswith('JJ')) and \
|
59 |
+
word.lower() not in common_words and \
|
60 |
+
len(word) > 2:
|
61 |
+
important_words.append(word.lower())
|
62 |
+
|
63 |
+
# Combine named entities and important words, count frequencies
|
64 |
+
all_keywords = named_entities + important_words
|
65 |
+
keyword_freq = Counter(all_keywords)
|
66 |
+
|
67 |
+
# Get most common keywords
|
68 |
+
main_keywords = [word for word, count in keyword_freq.most_common(max_keywords)]
|
69 |
+
|
70 |
+
# If no keywords found, return None to trigger general news search
|
71 |
+
if not main_keywords:
|
72 |
+
return None
|
73 |
+
|
74 |
+
# Create search query
|
75 |
+
search_query = ' OR '.join(f'"{kw}"' for kw in main_keywords)
|
76 |
+
print(f"Generated search query: {search_query}")
|
77 |
+
return search_query
|
78 |
+
|
79 |
+
except Exception as e:
|
80 |
+
print(f"Error in keyword extraction: {str(e)}")
|
81 |
+
return None
|
82 |
+
|
83 |
+
def get_recent_news(self, search_query=None):
|
84 |
if not self.api_key:
|
85 |
print("Cannot fetch news: No API key configured")
|
86 |
return pd.DataFrame()
|
|
|
88 |
try:
|
89 |
# Get news from the last 7 days
|
90 |
week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
|
91 |
+
articles = []
|
92 |
+
|
93 |
+
# Get top headlines first (major news events)
|
94 |
+
top_headlines = self.newsapi.get_top_headlines(
|
95 |
language='en',
|
96 |
+
page_size=10 # Limit to top 10 headlines
|
|
|
97 |
)
|
98 |
+
if top_headlines['status'] == 'ok':
|
99 |
+
articles.extend(top_headlines['articles'])
|
100 |
+
|
101 |
+
# If we have specific keywords, search for related news
|
102 |
+
if search_query:
|
103 |
+
everything = self.newsapi.get_everything(
|
104 |
+
q=search_query,
|
105 |
+
from_param=week_ago,
|
106 |
+
language='en',
|
107 |
+
sort_by='relevancy',
|
108 |
+
page_size=15 # More articles for specific searches
|
109 |
+
)
|
110 |
+
if everything['status'] == 'ok':
|
111 |
+
articles.extend(everything['articles'])
|
112 |
|
113 |
+
# Extract and clean article data
|
114 |
+
news_data = []
|
115 |
+
seen_titles = set() # To avoid duplicates
|
116 |
+
|
117 |
+
for article in articles:
|
118 |
+
title = article.get('title', '').strip()
|
119 |
+
desc = article.get('description', '').strip()
|
120 |
+
|
121 |
+
# Skip articles without title or description
|
122 |
+
if not title or not desc:
|
123 |
+
continue
|
124 |
+
|
125 |
+
# Skip duplicate titles
|
126 |
+
if title in seen_titles:
|
127 |
+
continue
|
128 |
+
|
129 |
+
news_data.append({
|
130 |
+
'title': title,
|
131 |
+
'description': desc
|
132 |
+
})
|
133 |
+
seen_titles.add(title)
|
134 |
+
|
135 |
+
print(f"Successfully fetched {len(news_data)} unique articles")
|
136 |
+
return pd.DataFrame(news_data)
|
137 |
|
138 |
except Exception as e:
|
139 |
print(f"Error fetching news: {str(e)}")
|
140 |
return pd.DataFrame()
|
141 |
|
142 |
def check_content_against_news(self, marketing_text):
|
143 |
+
# Extract meaningful keywords from marketing text
|
144 |
+
search_query = self.extract_keywords(marketing_text)
|
145 |
+
print(f"Using search query: {search_query}")
|
146 |
+
|
147 |
+
# Get news articles
|
148 |
+
news_df = self.get_recent_news(search_query)
|
149 |
if news_df.empty:
|
150 |
return {
|
151 |
'status': 'warning',
|
152 |
'message': 'Unable to check against current news context. Proceed with caution.'
|
153 |
}
|
154 |
+
|
155 |
+
# Prepare marketing text for comparison
|
156 |
+
marketing_words = set(word.lower() for word in word_tokenize(marketing_text))
|
157 |
potential_conflicts = []
|
158 |
|
159 |
for _, row in news_df.iterrows():
|
160 |
+
title_words = set(word.lower() for word in word_tokenize(row['title']))
|
161 |
+
desc_words = set(word.lower() for word in word_tokenize(str(row['description'])))
|
162 |
+
|
163 |
+
# Calculate overlap ratios
|
164 |
+
title_overlap = len(marketing_words.intersection(title_words)) / len(title_words)
|
165 |
+
desc_overlap = len(marketing_words.intersection(desc_words)) / len(desc_words)
|
166 |
|
167 |
+
# Flag if significant overlap found
|
168 |
+
if title_overlap > 0.3 or desc_overlap > 0.25: # Adjusted thresholds
|
169 |
potential_conflicts.append(row['title'])
|
170 |
+
|
171 |
if potential_conflicts:
|
172 |
return {
|
173 |
'status': 'warning',
|
174 |
+
'message': 'Potential conflicts found with current news:\n- ' +
|
175 |
+
'\n- '.join(potential_conflicts[:3]) +
|
176 |
+
('\n\nAnd more...' if len(potential_conflicts) > 3 else '')
|
177 |
}
|
178 |
+
|
179 |
return {
|
180 |
'status': 'pass',
|
181 |
'message': 'No significant conflicts with current news found.'
|
requirements.txt
CHANGED
@@ -8,5 +8,4 @@ pandas==2.1.4
|
|
8 |
numpy==1.24.3
|
9 |
requests==2.31.0
|
10 |
python-dotenv==1.0.0
|
11 |
-
|
12 |
-
sacremoses==0.1.1
|
|
|
8 |
numpy==1.24.3
|
9 |
requests==2.31.0
|
10 |
python-dotenv==1.0.0
|
11 |
+
nltk==3.8.1
|
|