Create fetch_news.py
fetch_news.py  +112 -0
fetch_news.py
ADDED
@@ -0,0 +1,112 @@
# fetch_news.py
import aiohttp
import feedparser
import asyncio
from datetime import datetime, timedelta
from config import load_feeds, load_api_keys, SETTINGS
import logging
from typing import Dict, List, Union
from urllib.parse import urlparse, urlunparse

logger = logging.getLogger(__name__)


class NewsFetcher:
    """Fetches recent articles from RSS feeds and from NewsAPI."""

    def __init__(self):
        self.feeds = load_feeds()
        self.api_keys = load_api_keys()
        self.session = None

    async def __aenter__(self):
        # One shared HTTP session for all requests made by this fetcher.
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=SETTINGS["request_timeout"]),
            headers={"User-Agent": "NewsBot/1.0"}
        )
        return self

    async def __aexit__(self, *exc):
        await self.session.close()
        self.session = None

    async def fetch(self, url: str, source_type: str = "rss") -> Union[str, Dict, None]:
        """Fetch a URL with retries; returns parsed JSON for NewsAPI, raw text for RSS."""
        retries = 0
        while retries < SETTINGS["max_retries"]:
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json() if source_type == "newsapi" else await response.text()
                    logger.warning(f"Failed to fetch {url}: status {response.status}")
            except Exception as e:
                logger.error(f"Connection error: {str(e)}")
            retries += 1
            await asyncio.sleep(5)  # back off briefly before the next attempt
        return None

    async def process_rss(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Parse one RSS feed and return articles published within the last 24 hours."""
        content = await self.fetch(feed_config["url"])
        if not content:
            return []

        feed = feedparser.parse(content)
        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for entry in feed.entries[:SETTINGS["max_articles"]]:
            try:
                # Fall back to "now" when the feed entry has no publication date.
                pub_date = datetime(*entry.published_parsed[:6]) if hasattr(entry, 'published_parsed') else datetime.now()
                if pub_date < cutoff_time:
                    continue

                article = {
                    "title": entry.title,
                    "link": self.normalize_url(entry.link),
                    "source": feed_config["name"],
                    "description": entry.get("summary", "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general")
                }

                # Deduplicate across all sources via the shared processed_links set.
                if article["link"] not in processed_links:
                    articles.append(article)
                    processed_links.add(article["link"])
            except Exception as e:
                logger.error(f"Error processing RSS entry: {str(e)}")
        return articles

    async def get_news_api(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Query NewsAPI for the configured search term and return recent articles."""
        api_key = self.api_keys.get("newsapi")
        if not api_key:
            logger.error("NewsAPI key is missing!")
            return []

        url = f"https://newsapi.org/v2/everything?q={feed_config['query']}&pageSize={SETTINGS['newsapi_page_size']}&apiKey={api_key}"
        data = await self.fetch(url, "newsapi")
        if not data:
            return []

        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for article in data.get("articles", []):
            try:
                # NewsAPI timestamps end in "Z"; strip it so fromisoformat accepts them.
                pub_date = datetime.fromisoformat(article["publishedAt"].rstrip("Z"))
                if pub_date < cutoff_time:
                    continue

                entry = {
                    "title": article["title"],
                    "link": self.normalize_url(article["url"]),
                    "source": feed_config["name"],
                    # NewsAPI may return null descriptions, so guard before slicing.
                    "description": (article.get("description") or "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general")
                }

                if entry["link"] not in processed_links:
                    articles.append(entry)
                    processed_links.add(entry["link"])
            except Exception as e:
                logger.error(f"Error processing NewsAPI article: {str(e)}")
        return articles

    def normalize_url(self, url: str) -> str:
        # Strip query parameters and fragments so duplicate links compare equal.
        parsed = urlparse(url)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
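
For context, a minimal usage sketch (not part of this commit). It assumes config.py exposes load_feeds(), load_api_keys(), and SETTINGS exactly as imported above, and that each feed entry carries a "type" key distinguishing RSS sources from NewsAPI queries; the "type" key and the main() driver are illustrative assumptions, not anything fetch_news.py itself defines.

# usage_sketch.py -- hypothetical driver, not included in this commit.
# Assumes load_feeds() yields dicts such as
#   {"name": "Example", "url": "https://example.org/rss", "category": "tech", "type": "rss"}
#   {"name": "NewsAPI query", "query": "python", "category": "tech", "type": "newsapi"}
# The "type" key is an assumed convention for choosing between the two fetch methods.
import asyncio
from fetch_news import NewsFetcher

async def main():
    processed_links = set()  # shared across sources so duplicate links are skipped
    collected = []
    async with NewsFetcher() as fetcher:
        for feed in fetcher.feeds:
            if feed.get("type") == "newsapi":
                collected += await fetcher.get_news_api(feed, processed_links)
            else:
                collected += await fetcher.process_rss(feed, processed_links)
    for article in sorted(collected, key=lambda a: a["published"], reverse=True):
        print(article["published"], article["source"], article["title"])

if __name__ == "__main__":
    asyncio.run(main())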