# fetch_news.py
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Union
from urllib.parse import urlparse, urlunparse

import aiohttp
import feedparser

from config import load_feeds, load_api_keys, SETTINGS

logger = logging.getLogger(__name__)


class NewsFetcher:
    """Fetches articles from configured RSS feeds and from NewsAPI."""

    def __init__(self):
        self.feeds = load_feeds()
        self.api_keys = load_api_keys()
        self.session = None

    async def __aenter__(self):
        # One shared HTTP session for all requests, with a global timeout.
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=SETTINGS["request_timeout"]),
            headers={"User-Agent": "NewsBot/1.0"},
        )
        return self

    async def __aexit__(self, *exc):
        await self.session.close()
        self.session = None

    async def fetch(self, url: str, source_type: str = "rss") -> Union[str, Dict, None]:
        """Fetch a URL with retries; returns JSON for NewsAPI, raw text for RSS."""
        retries = 0
        while retries < SETTINGS["max_retries"]:
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json() if source_type == "newsapi" else await response.text()
                    logger.warning(f"Error fetching {url}: status {response.status}")
            except Exception as e:
                logger.error(f"Connection error: {str(e)}")
            # Count the attempt and back off before retrying.
            retries += 1
            await asyncio.sleep(5)
        return None

    async def process_rss(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Parse an RSS feed and return articles published within the last 24 hours."""
        content = await self.fetch(feed_config["url"])
        if not content:
            return []

        feed = feedparser.parse(content)
        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for entry in feed.entries[:SETTINGS["max_articles"]]:
            try:
                pub_date = (
                    datetime(*entry.published_parsed[:6])
                    if getattr(entry, "published_parsed", None)
                    else datetime.now()
                )
                if pub_date < cutoff_time:
                    continue

                article = {
                    "title": entry.title,
                    "link": self.normalize_url(entry.link),
                    "source": feed_config["name"],
                    "description": entry.get("summary", "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general"),
                }
                if article["link"] not in processed_links:
                    articles.append(article)
                    processed_links.add(article["link"])
            except Exception as e:
                logger.error(f"Error processing article: {str(e)}")

        return articles

    async def get_news_api(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Query NewsAPI for a configured search term and return recent articles."""
        api_key = self.api_keys.get("newsapi")
        if not api_key:
            logger.error("NewsAPI key is missing!")
            return []

        url = (
            f"https://newsapi.org/v2/everything?q={feed_config['query']}"
            f"&pageSize={SETTINGS['newsapi_page_size']}&apiKey={api_key}"
        )
        data = await self.fetch(url, "newsapi")
        if not data:
            return []

        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for article in data.get("articles", []):
            try:
                pub_date = datetime.fromisoformat(article["publishedAt"].rstrip("Z"))
                if pub_date < cutoff_time:
                    continue

                entry = {
                    "title": article["title"],
                    "link": self.normalize_url(article["url"]),
                    "source": feed_config["name"],
                    # NewsAPI may return null for "description", so guard before slicing.
                    "description": (article.get("description") or "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general"),
                }
                if entry["link"] not in processed_links:
                    articles.append(entry)
                    processed_links.add(entry["link"])
            except Exception as e:
                logger.error(f"Error processing NewsAPI article: {str(e)}")

        return articles

    def normalize_url(self, url: str) -> str:
        """Strip query string and fragment so duplicate links can be detected."""
        parsed = urlparse(url)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
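
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how NewsFetcher might be driven end to end. It assumes
# that load_feeds() returns a list of dicts and that each feed config carries a
# "type" key ("rss" or "newsapi") in addition to the fields used above; both
# the "type" key and the _example_main() coroutine are assumptions, not
# confirmed by this module.

async def _example_main():
    processed_links: set = set()
    all_articles: List[Dict] = []

    async with NewsFetcher() as fetcher:
        for feed_config in fetcher.feeds:
            # Dispatch on the (assumed) "type" field of the feed configuration.
            if feed_config.get("type") == "newsapi":
                all_articles.extend(await fetcher.get_news_api(feed_config, processed_links))
            else:
                all_articles.extend(await fetcher.process_rss(feed_config, processed_links))

    logger.info("Fetched %d new articles", len(all_articles))


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(_example_main())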