import datetime
from typing import Mapping

import pandas as pd
import requests
from bs4 import BeautifulSoup

NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"


def find_soup_text_with_default(soup, element: str, find_search_map: Mapping[str, str]) -> str:
    """Return the stripped text of the first matching element, or "" if absent."""
    find_result = soup.find(element, find_search_map)
    if not find_result:
        return ""
    return find_result.text.strip()


def parse_player_div(player_div) -> dict:
    """Flatten one PlayerNewsPost <div> into a dict of display-ready fields."""
    date_div = player_div.find("div", {"class": "PlayerNewsPost-date"})
    return {
        # The post timestamp lives in the data-date attribute, not the tag text.
        "Date/Time": date_div.get("data-date") if date_div else None,
        "Name": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-name"}),
        "Team": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-team-abbr"}).upper(),
        "Position": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-position"}).title(),
        "Headline": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-headline"}),
        "Analysis": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-analysis"}),
    }


def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
    """Fetch and parse one page of player news; empty DataFrame if no posts."""
    url = f"{NEWS_URL}?p={page_number}"
    request_page = requests.get(url, timeout=30)
    request_page.raise_for_status()
    soup = BeautifulSoup(request_page.content, "html.parser")
    player_div_list = soup.find_all("div", {"class": "PlayerNewsPost"})
    if not player_div_list:
        return pd.DataFrame()
    parsed_player_list = [parse_player_div(d) for d in player_div_list]
    df = pd.DataFrame(parsed_player_list)
    # Parse as UTC so the column compares cleanly against timezone-aware cutoffs.
    df["Date/Time"] = pd.to_datetime(df["Date/Time"], utc=True)
    return df


def get_player_news_window_hours(hours: int = 1) -> pd.DataFrame:
    """Page through the feed until a post older than `hours` ago appears."""
    cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours=hours)
    page = 1
    max_pages = 20
    date_reached = False
    df_list = []
    while page <= max_pages and not date_reached:
        last_news = get_nfl_player_news(page)
        if last_news.empty:
            break  # ran out of pages before reaching the cutoff
        df_list.append(last_news)
        date_reached = last_news["Date/Time"].min() < cutoff
        page += 1
    if not df_list:
        return pd.DataFrame()
    df = pd.concat(df_list, ignore_index=True)
    # The last page fetched usually extends past the cutoff, so trim to the window.
    return df[df["Date/Time"] >= cutoff].reset_index(drop=True)
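
# A minimal usage sketch, assuming network access and that the NBC Sports page
# still renders posts with the PlayerNewsPost-* class names scraped above; if
# the markup changes, the functions return empty DataFrames rather than rows.
if __name__ == "__main__":
    # One page of the feed, most recent posts first.
    first_page = get_nfl_player_news(page_number=1)
    print(first_page[["Date/Time", "Name", "Headline"]].head())

    # Everything posted in roughly the last six hours, across pages.
    recent = get_player_news_window_hours(hours=6)
    print(f"{len(recent)} posts in the last 6 hours")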