|
|
|
import logging
import re
from typing import Any, Dict, Iterable, List

from config import CATEGORY_FILTERS
|
|
|
logger = logging.getLogger(__name__)


class NewsFilter:
    """Drop blacklisted articles and rank the rest by category keyword hits.

    Articles are mappings read through the keys ``"title"``,
    ``"description"`` and ``"category"``. Word lists and category keywords
    are matched as literal, case-insensitive whole words.
    """

    def __init__(self):
        """Load the category keywords and the blacklist/whitelist files."""
        self.keyword_filters = CATEGORY_FILTERS
        self.blacklist = self.load_wordlist("blacklist.txt")
        self.whitelist = self.load_wordlist("whitelist.txt")

    def load_wordlist(self, filename: str) -> List[str]:
        """Read ``config/<filename>`` into a list of lower-cased words.

        Args:
            filename: Name of the word-list file inside the ``config``
                directory (resolved relative to the working directory).

        Returns:
            One stripped, lower-cased entry per non-blank line, or an empty
            list when the file does not exist.
        """
        path = f"config/{filename}"
        try:
            # Explicit encoding so the result does not depend on the locale.
            with open(path, encoding="utf-8") as f:
                return [line.strip().lower() for line in f if line.strip()]
        except FileNotFoundError:
            # A missing list stays non-fatal, but leave a trace in the log.
            logger.warning("Word list %s not found; treating it as empty", path)
            return []

    def filter_articles(
        self, articles: Iterable[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Remove blacklisted articles and return the rest, best match first.

        An article that matches the blacklist is still kept when it also
        matches the whitelist.

        Args:
            articles: Article mappings to filter.

        Returns:
            The surviving articles ordered by ``prioritize_articles``.
        """
        kept = [
            article
            for article in articles
            if not self.is_blacklisted(article) or self.is_whitelisted(article)
        ]
        return self.prioritize_articles(kept)

    def is_blacklisted(self, article: Dict[str, Any]) -> bool:
        """Return True if the article text contains any blacklist word."""
        text = self._article_text(article)
        return any(self._contains_word(text, word) for word in self.blacklist)

    def is_whitelisted(self, article: Dict[str, Any]) -> bool:
        """Return True if the article text contains any whitelist word."""
        text = self._article_text(article)
        return any(self._contains_word(text, word) for word in self.whitelist)

    def prioritize_articles(
        self, articles: Iterable[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Sort articles by how many of their category's keywords they contain.

        Args:
            articles: Article mappings to rank.

        Returns:
            A new list, highest keyword count first. The sort is stable, so
            articles with equal counts keep their input order.
        """
        return sorted(articles, key=self._relevance_score, reverse=True)

    def _relevance_score(self, article: Dict[str, Any]) -> int:
        """Count the distinct category keywords present in the article text."""
        # An unknown or missing category simply has no keywords to match.
        keywords = self.keyword_filters.get(article.get("category")) or []
        text = self._article_text(article)
        return sum(1 for keyword in keywords if self._contains_word(text, keyword))

    @staticmethod
    def _article_text(article: Dict[str, Any]) -> str:
        """Join title and description into one lower-cased search string."""
        # Skip missing/None/empty fields so a None value is not rendered as
        # the literal text "None" (which would match a "none" word entry).
        parts = (article.get("title"), article.get("description"))
        return " ".join(str(part) for part in parts if part).lower()

    @staticmethod
    def _contains_word(text: str, word: str) -> bool:
        """Return True if ``word`` occurs in ``text`` as a whole, literal term."""
        if not word:
            return False
        # re.escape keeps entries such as "c++" or "u.s." literal. Lookarounds
        # replace \b because \b needs a word character on one side and so
        # fails for terms that start or end with punctuation.
        pattern = rf"(?<!\w){re.escape(word.lower())}(?!\w)"
        return re.search(pattern, text) is not None