Dunevhhhh committed on
Commit
f256f51
·
verified ·
1 Parent(s): 69174a5

Create filter_news.py

Browse files
Files changed (1) hide show
  1. filter_news.py +47 -0
filter_news.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # filter_news.py (erweitert)
2
+ import re
3
+ from config import CATEGORY_FILTERS
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
class NewsFilter:
    """Drop blacklisted articles and rank the rest by category keywords.

    Articles are mappings that may carry ``title``, ``description`` and
    ``category`` entries.  Wordlist entries and category keywords are
    treated as literal terms (not regular expressions) and are matched
    case-insensitively on whole-term boundaries.
    """

    def __init__(self):
        """Load the category keywords and the black-/whitelist files."""
        self.keyword_filters = CATEGORY_FILTERS
        self.blacklist = self.load_wordlist("blacklist.txt")
        self.whitelist = self.load_wordlist("whitelist.txt")

    def load_wordlist(self, filename):
        """Read ``config/<filename>`` into a list of lower-cased terms.

        Blank lines are skipped and surrounding whitespace is stripped.

        Args:
            filename: Name of the wordlist file inside ``config/``.

        Returns:
            The list of terms, or an empty list when the file is missing.
        """
        path = f"config/{filename}"
        try:
            # Explicit encoding so non-ASCII terms decode the same way on
            # every platform instead of depending on the locale default.
            with open(path, encoding="utf-8") as f:
                return [line.strip().lower() for line in f if line.strip()]
        except FileNotFoundError:
            # A missing list is tolerated (filtering is best-effort), but
            # it should not go unnoticed.
            logging.getLogger(__name__).warning(
                "Wordlist %s not found; using an empty list", path
            )
            return []

    @staticmethod
    def _article_text(article):
        """Return the lower-cased ``title`` and ``description`` of *article*.

        Missing or ``None`` fields count as empty text so that they are not
        rendered as the literal string ``"None"`` and matched by accident.
        """
        title = article.get("title") or ""
        description = article.get("description") or ""
        return f"{title} {description}".lower()

    @staticmethod
    def _contains_term(text, term):
        """Return True if *term* occurs in *text* as a whole term.

        The term is escaped, so characters such as ``+`` or ``.`` are
        matched literally.  Lookarounds are used instead of ``\\b`` because
        ``\\b`` cannot match next to a non-word character (e.g. ``c++``);
        for ordinary words both forms behave identically.
        """
        if not term:
            return False
        return re.search(rf"(?<!\w){re.escape(term)}(?!\w)", text) is not None

    def filter_articles(self, articles):
        """Remove blacklisted articles and return the rest, prioritized.

        An article is dropped only when it matches the blacklist and does
        not match the whitelist.

        Args:
            articles: Iterable of article mappings.

        Returns:
            A new list of the remaining articles, best keyword match first.
        """
        filtered = [
            article
            for article in articles
            if not (self.is_blacklisted(article) and not self.is_whitelisted(article))
        ]
        return self.prioritize_articles(filtered)

    def is_blacklisted(self, article):
        """Return True if the article text contains any blacklist term."""
        text = self._article_text(article)
        return any(self._contains_term(text, word) for word in self.blacklist)

    def is_whitelisted(self, article):
        """Return True if the article text contains any whitelist term."""
        text = self._article_text(article)
        return any(self._contains_term(text, word) for word in self.whitelist)

    def prioritize_articles(self, articles):
        """Sort articles by how many keywords of their category they match.

        Articles with an unknown or missing category score zero.  The sort
        is stable, so articles with equal scores keep their input order.

        Args:
            articles: Iterable of article mappings.

        Returns:
            A new list, highest keyword score first.
        """

        def sort_key(article):
            category_keywords = self.keyword_filters.get(article.get("category"), [])
            text = self._article_text(article)
            score = sum(
                1
                for keyword in category_keywords
                if self._contains_term(text, keyword.lower())
            )
            return -score  # negate so that higher scores sort first

        return sorted(articles, key=sort_key)