Dunevhhhh committed
Commit b58bf24 · verified · 1 Parent(s): e6b49ad

Create fetch_news.py

Files changed (1)
  1. fetch_news.py +112 -0
fetch_news.py ADDED
@@ -0,0 +1,112 @@
# fetch_news.py
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Union
from urllib.parse import urlparse, urlunparse

import aiohttp
import feedparser

from config import load_feeds, load_api_keys, SETTINGS

logger = logging.getLogger(__name__)


class NewsFetcher:
    """Collects recent articles from configured RSS feeds and NewsAPI queries."""

    def __init__(self):
        self.feeds = load_feeds()
        self.api_keys = load_api_keys()
        self.session = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=SETTINGS["request_timeout"]),
            headers={"User-Agent": "NewsBot/1.0"}
        )
        return self

    async def __aexit__(self, *exc):
        await self.session.close()
        self.session = None

    async def fetch(self, url: str, source_type: str = "rss") -> Union[str, Dict, None]:
        """Fetch a URL with retries; returns parsed JSON for NewsAPI, raw text for RSS."""
        retries = 0
        while retries < SETTINGS["max_retries"]:
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json() if source_type == "newsapi" else await response.text()
                    logger.warning(f"Failed to fetch {url}: status {response.status}")
            except Exception as e:
                logger.error(f"Connection error: {str(e)}")
            retries += 1
            await asyncio.sleep(5)
        return None

    async def process_rss(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Parse one RSS feed and return articles published within the last 24 hours."""
        content = await self.fetch(feed_config["url"])
        if not content:
            return []

        feed = feedparser.parse(content)
        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for entry in feed.entries[:SETTINGS["max_articles"]]:
            try:
                pub_date = datetime(*entry.published_parsed[:6]) if hasattr(entry, "published_parsed") else datetime.now()
                if pub_date < cutoff_time:
                    continue

                article = {
                    "title": entry.title,
                    "link": self.normalize_url(entry.link),
                    "source": feed_config["name"],
                    "description": entry.get("summary", "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general")
                }

                if article["link"] not in processed_links:
                    articles.append(article)
                    processed_links.add(article["link"])
            except Exception as e:
                logger.error(f"Error while processing article: {str(e)}")
        return articles

    async def get_news_api(self, feed_config: Dict, processed_links: set) -> List[Dict]:
        """Query NewsAPI for the configured search term and return recent articles."""
        api_key = self.api_keys.get("newsapi")
        if not api_key:
            logger.error("NewsAPI key is missing!")
            return []

        url = (
            f"https://newsapi.org/v2/everything?q={feed_config['query']}"
            f"&pageSize={SETTINGS['newsapi_page_size']}&apiKey={api_key}"
        )
        data = await self.fetch(url, "newsapi")
        if not data:
            return []

        articles = []
        cutoff_time = datetime.now() - timedelta(hours=24)

        for article in data.get("articles", []):
            try:
                pub_date = datetime.fromisoformat(article["publishedAt"].rstrip("Z"))
                if pub_date < cutoff_time:
                    continue

                entry = {
                    "title": article["title"],
                    "link": self.normalize_url(article["url"]),
                    "source": feed_config["name"],
                    # NewsAPI can return null descriptions, so fall back to an empty string.
                    "description": (article.get("description") or "")[:500],
                    "published": pub_date,
                    "category": feed_config.get("category", "general")
                }

                if entry["link"] not in processed_links:
                    articles.append(entry)
                    processed_links.add(entry["link"])
            except Exception as e:
                logger.error(f"Error while processing NewsAPI article: {str(e)}")
        return articles

    def normalize_url(self, url: str) -> str:
        """Strip query string and fragment so duplicate article URLs compare equal."""
        parsed = urlparse(url)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
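A minimal usage sketch follows for orientation. config.py is not part of this commit, so the feed dictionary keys used here ("type", "name", "url", "query", "category") and the SETTINGS keys ("request_timeout", "max_retries", "max_articles", "newsapi_page_size") are assumptions inferred from how fetch_news.py reads its configuration; the real names may differ.

# example_usage.py -- illustrative sketch only, not part of this commit.
# Assumes config.load_feeds() returns a list of feed dicts and that the
# "type" key distinguishes NewsAPI queries from RSS feeds (an assumed convention).
import asyncio

from fetch_news import NewsFetcher


async def main():
    processed_links = set()  # shared across sources so duplicate URLs are kept only once
    articles = []

    async with NewsFetcher() as fetcher:
        for feed in fetcher.feeds:
            if feed.get("type") == "newsapi":
                articles += await fetcher.get_news_api(feed, processed_links)
            else:
                articles += await fetcher.process_rss(feed, processed_links)

    # Print newest articles first.
    for item in sorted(articles, key=lambda a: a["published"], reverse=True):
        print(f'{item["published"]:%Y-%m-%d %H:%M} [{item["source"]}] {item["title"]}')


if __name__ == "__main__":
    asyncio.run(main())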