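"""Loads nflverse parquet assets into in-memory DuckDB tables and exposes
small query helpers for the season data used by the app."""
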
import os
from typing import Callable

import duckdb
import pandas as pd
import streamlit as st

# Increase the number of rows DuckDB samples when inferring column types from
# pandas DataFrames, so sparsely populated columns are typed correctly.
duckdb.default_connection.execute("SET GLOBAL pandas_analyze_sample=100000")

BASE_URL = "https://github.com/nflverse/nflverse-data/releases/download/"

FANTASY_POSITIONS = [
    "QB",
    "RB",
    "WR",
    "TE",
    "FB",
    "K",
]


def get_snap_counts(season_int: int) -> pd.DataFrame:
    df = duckdb.sql(f"SELECT * from snap_counts_snap_counts_{season_int}").df()
    # Flag rows whose position is relevant to fantasy scoring.
    df["fantasy_position"] = df["position"].isin(FANTASY_POSITIONS)
    return df


def get_play_by_play(season_int: int) -> pd.DataFrame:
    df = duckdb.sql(f"SELECT * from pbp_play_by_play_{season_int}").df()
    return df


def get_player_stats(season_int: int) -> pd.DataFrame:
    # The player stats asset is registered per season (see NFLVERSE_ASSETS),
    # so the table name carries the season suffix.
    df = duckdb.sql(f"SELECT * from player_stats_player_stats_{season_int}").df()
    return df


def get_ftn_charting(season_int: int) -> pd.DataFrame:
    df = duckdb.sql(f"SELECT * from ftn_charting_ftn_charting_{season_int}").df()
    return df


def get_pbp_participation(season_int: int) -> pd.DataFrame:
    # Join participation rows to play-by-play to pull in play context
    # (week, down, quarter, etc.); keep only plays with a matching week.
    df = duckdb.sql(
        f"""
        SELECT
            a.*
            , b.week
            , b.down
            , b.qtr
            , b.ydstogo
            , b.play_type
            , b.pass_length
            , b.pass_location
            , 1 as count_col
        from pbp_participation_pbp_participation_{season_int} a
        left join pbp_play_by_play_{season_int} b
            on a.play_id = b.play_id
            and a.nflverse_game_id = b.game_id
        where b.week is not null
        """
    ).df()
    return df


def get_depth_charts(season_int: int) -> pd.DataFrame:
    df = duckdb.sql(f"SELECT * from depth_charts_depth_charts_{season_int}").df()
    return df


def get_nextgen_stats(season_int: int, stat_category: str) -> pd.DataFrame:
    df = duckdb.sql(f"SELECT * from nextgen_stats_ngs_{stat_category} where season = {season_int}").df()
    return df


SEASON = "2024"

NFLVERSE_ASSETS = [
    ("ftn_charting", f"ftn_charting_{SEASON}.parquet"),
    ("espn_data", "qbr_season_level.parquet"),
    ("espn_data", "qbr_week_level.parquet"),
    ("players", "players.parquet"),
    ("pbp_participation", f"pbp_participation_{SEASON}.parquet"),
    ("snap_counts", f"snap_counts_{SEASON}.parquet"),
    ("player_stats", f"player_stats_{SEASON}.parquet"),
    ("player_stats", f"player_stats_def_{SEASON}.parquet"),
    ("player_stats", f"player_stats_kicking_{SEASON}.parquet"),
    ("pfr_advstats", "advstats_season_def.parquet"),
    ("pfr_advstats", "advstats_season_pass.parquet"),
    ("pfr_advstats", "advstats_season_rec.parquet"),
    ("pfr_advstats", "advstats_season_rush.parquet"),
    ("pfr_advstats", f"advstats_week_def_{SEASON}.parquet"),
    ("pfr_advstats", f"advstats_week_pass_{SEASON}.parquet"),
    ("pfr_advstats", f"advstats_week_rec_{SEASON}.parquet"),
    ("pfr_advstats", f"advstats_week_rush_{SEASON}.parquet"),
    ("pbp", f"play_by_play_{SEASON}.parquet"),
    ("nextgen_stats", "ngs_passing.parquet"),
    ("nextgen_stats", "ngs_receiving.parquet"),
    ("nextgen_stats", "ngs_rushing.parquet"),
    ("depth_charts", f"depth_charts_{SEASON}.parquet"),
]


class NflVerseDataAsset:
    """A single nflverse release asset and the DuckDB table it is registered as."""

    def __init__(
        self,
        release_tag: str,
        asset_name: str,
        dataframe_mutation_fxn: Callable[[pd.DataFrame], pd.DataFrame] = lambda x: x,
    ):
        self.release_tag = release_tag
        self.asset_name = asset_name
        self.dataframe_mutation_fxn = dataframe_mutation_fxn
        # e.g. ("snap_counts", "snap_counts_2024.parquet") -> "snap_counts_snap_counts_2024"
        self.table_name = f"{release_tag}_{asset_name.rsplit('.', 1)[0]}"

    def load_parquet_asset_to_df(self) -> pd.DataFrame:
        location = os.path.join(BASE_URL, self.release_tag, self.asset_name)
        df = pd.read_parquet(location)
        return df

    def register_asset_to_duckdb(self) -> None:
        df = self.load_parquet_asset_to_df()
        df = self.dataframe_mutation_fxn(df)
        # Register the DataFrame so it can be queried by name in DuckDB.
        duckdb.register(self.table_name, df)


def load_assets():
    for tag, asset_name in NFLVERSE_ASSETS:
        try:
            asset = NflVerseDataAsset(tag, asset_name)
            asset.register_asset_to_duckdb()
        except Exception:
            # Surface the failure in the Streamlit UI and keep loading the rest.
            st.write(f"{tag} failed to load")


def get_current_tables(duckdb_conn=duckdb) -> list[str]:
    current_tables_df = duckdb_conn.sql("SHOW TABLES").df()
    return current_tables_df["name"].tolist()
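

# Minimal usage sketch (an illustrative addition, not part of the app's page flow):
# register the assets listed above, then query one of the helpers. Assumes network
# access to the nflverse-data release URLs so pd.read_parquet can fetch each file.
if __name__ == "__main__":
    load_assets()
    print(get_current_tables())
    print(get_snap_counts(int(SEASON)).head())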