|
from datetime import datetime |
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
|
|
|
|
from urllib.parse import quote |
|
|
|
|
|
def aggregated_data(df, aggregation_level="week"):
    """Render the share of papers with at least one artifact, bucketed over time.

    Writes a heading, a growth-rate metric and a line chart to the Streamlit
    page. The input frame is left unmodified.

    Args:
        df: Papers table. Its index must be datetime-like (``resample`` is
            called on it) and it must provide the ``num_models``,
            ``num_datasets`` and ``num_spaces`` columns.
        aggregation_level: ``"week"`` buckets with frequency ``'W'``; any
            other value buckets with frequency ``'ME'`` (month end).
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # Keep the flag as a standalone Series rather than adding a column, so
    # the caller's DataFrame is not mutated as a side effect.
    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    freq = 'W' if aggregation_level == "week" else 'ME'
    total_papers = has_artifact.resample(freq).size()
    papers_with_artifacts = has_artifact.resample(freq).sum()

    # Buckets without any paper yield 0 / 0 = NaN and are simply not plotted.
    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

    # Period-over-period relative change of that percentage, in percent.
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100

    latest_growth_rate = growth_rate.iloc[-1] if not growth_rate.empty else 0
    # The latest value is NaN when there is no previous bucket and infinite
    # when the previous percentage was 0; show "N/A" instead of "nan%"/"inf%".
    if pd.notna(latest_growth_rate) and abs(latest_growth_rate) != float("inf"):
        growth_text = f"{latest_growth_rate:.2f}%"
    else:
        growth_text = "N/A"
    st.metric(label=f"{aggregation_level.capitalize()}ly Growth Rate", value=growth_text)

    # Use an explicit Figure/Axes instead of the global pyplot state, and
    # close the figure after rendering so figures do not pile up across
    # Streamlit reruns.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        percentage_papers_with_artifacts.index,
        percentage_papers_with_artifacts,
        marker='o',
        linestyle='-',
        color='b',
        label='Percentage of Papers with at least 1 Artifact',
    )
    ax.set_ylim(0, 100)
    ax.set_xlabel(aggregation_level)
    ax.set_ylabel('Percentage')
    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
    ax.legend()
    ax.grid(True)

    st.pyplot(fig)
    plt.close(fig)
|
|
|
|
|
def _render_paper_table(caption, papers, key, extra_column_config=None):
    """Write a caption followed by an editable table of the given papers.

    Args:
        caption: Text written above the table.
        papers: Rows to show in the editor.
        key: Streamlit widget key of the data editor.
        extra_column_config: Optional column-config entries merged on top of
            the default link columns.
    """
    column_config = {
        "github": st.column_config.LinkColumn(),
        "paper_page": st.column_config.LinkColumn(),
    }
    if extra_column_config:
        column_config.update(extra_column_config)

    st.write(caption)
    st.data_editor(
        papers,
        hide_index=True,
        column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
        column_config=column_config,
        width=2000,
        key=key,
    )


def display_data(df):
    """Show summary statistics and three editable paper tables for ``df``.

    The tables list papers with at least one artifact, papers without any
    artifact, and papers without artifacts whose ``hf_mention`` equals 1.

    Args:
        df: Papers table providing the ``num_models``, ``num_datasets``,
            ``num_spaces``, ``github`` and ``hf_mention`` columns, plus the
            ``paper_page`` and ``title`` columns shown in the tables.
            The frame is not modified.
    """
    # Callers pass filtered slices of a larger frame; work on a copy so the
    # helper columns added below neither leak back nor trigger pandas'
    # SettingWithCopyWarning.
    df = df.copy()

    df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
    num_papers = df.shape[0]
    num_artifacts = df['has_artifact'].sum()
    # Guard against division by zero when the selection contains no papers.
    artifact_ratio = num_artifacts / num_papers if num_papers > 0 else 0
    percentage_of_at_least_one_artifact = round(artifact_ratio * 100, 2)

    # Checkbox column that starts unticked for every row.
    df['reached_out'] = False

    st.markdown(f"""
    ## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact

    * Number of papers: {num_papers}
    * Number of papers with a Github link: {df['github'].notnull().sum()}
    * Number of papers with at least one HF artifact: {num_artifacts}
    """)

    _render_paper_table(
        "Papers with at least one artifact",
        df[df['has_artifact']],
        key="papers_with_artifacts",
        # NOTE(review): no "paper_page_with_title" column is created in the
        # visible code, so this entry presumably has no effect - confirm.
        extra_column_config={
            "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
        },
    )
    _render_paper_table(
        "Papers without artifacts",
        df[~df['has_artifact']],
        key="papers_without_artifacts",
    )
    _render_paper_table(
        "Papers with a HF mention in README but no artifacts",
        df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
        key="papers_with_hf_mention_no_artifacts",
    )
|
|
|
|
|
def main(csv_path='/Users/nielsrogge/Downloads/daily_papers_enriched (3).csv'):
    """Entry point of the Hugging Face artifacts KPI dashboard.

    Loads the papers CSV, indexes it by date and, depending on the sidebar
    selection, shows the papers of one day / ISO week / month or the
    aggregated weekly and monthly trends.

    Args:
        csv_path: Location of the papers CSV. It must contain a ``date``
            column plus the ``paper_page`` and ``title`` columns used here.
            Defaults to the path that was previously hard-coded.
    """
    st.title("Hugging Face Artifacts KPI Dashboard")

    st.sidebar.title("Navigation")
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    df = pd.read_csv(csv_path)
    # Drop the unnamed index column if the CSV has one.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    df = df.set_index('date')
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()

    # Built with a list comprehension rather than a row-wise ``apply`` so it
    # also works on an empty frame; ``str`` keeps a non-string title (e.g. a
    # missing value) from crashing ``quote``.
    df['updated_url'] = [
        f'{page}/title/{quote(str(title))}'
        for page, title in zip(df['paper_page'], df['title'])
    ]

    if selection == "Daily/weekly/monthly data":
        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])

        if view_level == "day":
            day = st.date_input("Select day", value="today", format="DD/MM/YYYY")
            day = pd.Timestamp(day)

            df = df[df.index.date == day.date()]

            st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
            display_data(df)

        elif view_level == "week":
            iso_calendar = df.index.isocalendar()
            today_iso = datetime.today().isocalendar()

            # Week numbers repeat every year, so the ISO year is selected too;
            # fall back to the current ISO year when the data is empty.
            iso_years = sorted({int(y) for y in iso_calendar["year"].dropna()}) or [today_iso[0]]
            year = st.selectbox("Select year", options=iso_years, index=len(iso_years) - 1, key="week_year")

            # ISO years can have 53 weeks; a maximum of 52 would reject the
            # default value during week 53.
            week_number = st.number_input("Select week", value=today_iso[1], min_value=1, max_value=53)

            in_selected_week = (
                (iso_calendar["week"] == week_number) & (iso_calendar["year"] == year)
            ).to_numpy(dtype=bool, na_value=False)
            df = df[in_selected_week]

            st.write(f"Showing data for week {week_number} of {year}")
            display_data(df)

        elif view_level == "month":
            month_names = [
                "January", "February", "March", "April", "May", "June",
                "July", "August", "September", "October", "November", "December",
            ]
            month_str = st.selectbox("Select month", options=month_names)

            # Offer the years present in the data instead of a fixed list;
            # fall back to the current year when the data is empty.
            available_years = sorted({int(y) for y in df.index.year.dropna()}) or [datetime.today().year]
            year = st.selectbox(
                "Select year",
                options=available_years,
                index=len(available_years) - 1,
                key="month_year",
            )

            month = month_names.index(month_str) + 1
            df = df[(df.index.month == month) & (df.index.year == year)]

            st.write(f"Showing data for {month_str} {year}")
            display_data(df)

    elif selection == "Aggregated data":
        aggregated_data(df)
        aggregated_data(df, aggregation_level="month")

    else:
        st.write("Error: selection not recognized")
|
|
|
|
|
|
|
|
|
|
|
# Launch the dashboard only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()