# Hugging Face Space: nielsr/community-science-progress (status: Running)
# File: community-science-progress/app.py (7.33 kB)
# Last commit: "Update requirements" (3170ddb) by nielsr (HF Staff), 10 months ago

	from datetime import datetime

	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt

	# from load_dataframe import get_data
	from urllib.parse import quote


	def aggregated_data(df, aggregation_level="week"):
	    """Render the share of papers with at least one artifact per period.

	    Writes a heading, a growth-rate metric and a line chart to the
	    Streamlit page.

	    Args:
	        df: DataFrame whose index supports ``resample`` (a DatetimeIndex)
	            and that has the columns ``num_models``, ``num_datasets`` and
	            ``num_spaces``. It is not modified.
	        aggregation_level: ``"week"`` resamples with frequency ``'W'``;
	            any other value resamples with ``'ME'`` (month end).
	    """
	    st.write(f"Aggregated data by {aggregation_level}")

	    # Flag papers that have any artifact. Kept as a local Series so the
	    # caller's DataFrame does not silently gain a column.
	    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

	    # Resample by the requested period (week or month end)
	    freq = 'W' if aggregation_level == "week" else 'ME'
	    total_papers = df.resample(freq).size()
	    papers_with_artifacts = has_artifact.resample(freq).sum()

	    # Percentage of papers with artifacts; periods without papers become NaN
	    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

	    # Period-over-period growth of that percentage
	    growth_rate = percentage_papers_with_artifacts.pct_change() * 100

	    # Display the latest growth rate as a big number
	    latest_growth_rate = growth_rate.iloc[-1] if not growth_rate.empty else 0
	    # With a single period, or a previous period at 0% / without papers, the
	    # growth is NaN or infinite; show "n/a" rather than "nan%" / "inf%".
	    is_defined = pd.notna(latest_growth_rate) and abs(latest_growth_rate) != float("inf")
	    metric_value = f"{latest_growth_rate:.2f}%" if is_defined else "n/a"
	    st.metric(label=f"{aggregation_level.capitalize()}ly Growth Rate", value=metric_value)

	    # Draw on an explicit figure instead of pyplot's implicit global one
	    fig, ax = plt.subplots(figsize=(12, 6))
	    ax.plot(
	        percentage_papers_with_artifacts.index,
	        percentage_papers_with_artifacts,
	        marker='o',
	        linestyle='-',
	        color='b',
	        label='Percentage of Papers with at least 1 Artifact',
	    )

	    # Percentages always live in [0, 100]
	    ax.set_ylim(0, 100)

	    ax.set_xlabel(aggregation_level)
	    ax.set_ylabel('Percentage')
	    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
	    ax.legend()
	    ax.grid(True)

	    # Use Streamlit to display the plot, then release the figure so that
	    # figures do not pile up across script reruns.
	    st.pyplot(fig)
	    plt.close(fig)


	def display_data(df):
	    """Show summary statistics and three editable paper tables.

	    Renders a markdown summary (share of papers with at least one artifact,
	    paper count, papers with a Github link) followed by tables of papers
	    with artifacts, papers without artifacts, and papers whose
	    ``hf_mention`` equals 1 but that have no artifacts.

	    Args:
	        df: DataFrame with the columns ``num_models``, ``num_datasets``,
	            ``num_spaces``, ``github`` and ``hf_mention``; the tables also
	            display ``paper_page`` and ``title``. It is not modified.
	    """
	    # Work on a copy: callers pass filtered slices of a larger frame, and
	    # adding columns to those would mutate (or warn about) the caller's data.
	    df = df.copy()
	    df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
	    # add reached out column (unchecked by default)
	    df['reached_out'] = False

	    num_papers = df.shape[0]
	    num_artifacts = int(df['has_artifact'].sum())
	    # Guard against an empty selection (e.g. a day without papers)
	    percentage_of_at_least_one_artifact = (
	        round(num_artifacts / num_papers * 100, 2) if num_papers > 0 else 0
	    )

	    # Built line by line so the markdown does not depend on source indentation
	    summary = "\n".join([
	        f"## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact",
	        "",
	        f"* Number of papers: {num_papers}",
	        f"* Number of papers with a Github link: {df['github'].notnull().sum()}",
	        f"* Number of papers with at least one HF artifact: {num_artifacts}",
	    ])
	    st.markdown(summary)

	    st.write("Papers with at least one artifact")
	    _show_paper_table(
	        df[df['has_artifact']],
	        key="papers_with_artifacts",
	        extra_column_config={
	            "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
	        },
	    )

	    st.write("Papers without artifacts")
	    _show_paper_table(df[~df['has_artifact']], key="papers_without_artifacts")

	    st.write("Papers with a HF mention in README but no artifacts")
	    _show_paper_table(
	        df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
	        key="papers_with_hf_mention_no_artifacts",
	    )


	def _show_paper_table(frame, key, extra_column_config=None):
	    """Render ``frame`` as an editable table with link columns for Github and paper page."""
	    column_config = {
	        "github": st.column_config.LinkColumn(),
	        "paper_page": st.column_config.LinkColumn(),
	    }
	    if extra_column_config:
	        column_config.update(extra_column_config)
	    st.data_editor(
	        frame,
	        hide_index=True,
	        column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
	        column_config=column_config,
	        width=2000,
	        key=key,
	    )


	def main(csv_path='/Users/nielsrogge/Downloads/daily_papers_enriched (3).csv'):
	    """Build the Streamlit dashboard.

	    Loads the papers CSV, indexes it by date, and renders either a
	    per-day/week/month view (via ``display_data``) or the weekly and monthly
	    aggregated charts (via ``aggregated_data``), depending on the sidebar
	    selection.

	    Args:
	        csv_path: Location of the papers CSV. It must contain the columns
	            ``date``, ``paper_page`` and ``title`` in addition to those read
	            by ``display_data`` / ``aggregated_data``. Defaults to the path
	            that was previously hard-coded.
	    """
	    st.title("Hugging Face Artifacts KPI Dashboard")

	    # Sidebar switch between the per-period view and the aggregated charts
	    st.sidebar.title("Navigation")
	    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

	    # TODO use get_data() from load_dataframe instead of a local CSV
	    try:
	        df = pd.read_csv(csv_path)
	    except FileNotFoundError:
	        # Show a readable message in the app instead of a raw traceback
	        st.error(f"Could not find the papers CSV at {csv_path}")
	        return
	    # Drop the stray index column left behind by an earlier to_csv, if any
	    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
	    # Use date as index
	    df = df.set_index('date')
	    df.index = pd.to_datetime(df.index)
	    df = df.sort_index()

	    # hack: include title in URL column. Missing titles become an empty
	    # string, because quote() raises on NaN.
	    df['updated_url'] = [
	        f'{paper_page}/title/{quote(str(title))}'
	        for paper_page, title in zip(df['paper_page'], df['title'].fillna(''))
	    ]

	    today = datetime.today()

	    if selection == "Daily/weekly/monthly data":
	        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])

	        if view_level == "day":
	            # Date picker defaulting to today, converted to a pandas Timestamp
	            day = pd.Timestamp(st.date_input("Select day", value="today", format="DD/MM/YYYY"))

	            day_df = df[df.index.date == day.date()]

	            st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")

	            display_data(day_df)

	        elif view_level == "week":
	            today_iso = today.isocalendar()
	            iso = df.index.isocalendar()

	            # The same week number occurs in every year, so the ISO year
	            # has to be selected as well; default to the latest year in the data.
	            iso_years = sorted({int(y) for y in iso['year'].dropna()}) or [today_iso[0]]
	            iso_year = st.selectbox("Select year", options=iso_years, index=len(iso_years) - 1)

	            # ISO years can have 53 weeks; a lower max_value would make the
	            # default (current week) invalid during week 53.
	            week_number = st.number_input("Select week", value=today_iso[1], min_value=1, max_value=53)

	            # Filter on ISO year + week without adding helper columns to df
	            in_week = (iso['year'] == iso_year) & (iso['week'] == week_number)
	            week_df = df[in_week.to_numpy(dtype=bool, na_value=False)]

	            st.write(f"Showing data for week {week_number} of {iso_year}")

	            display_data(week_df)

	        elif view_level == "month":
	            month_names = [
	                "January", "February", "March", "April", "May", "June",
	                "July", "August", "September", "October", "November", "December",
	            ]
	            # Month selector defaulting to the current month
	            month_str = st.selectbox("Select month", options=month_names, index=today.month - 1)

	            # Offer the years that actually occur in the data, latest first selected
	            years = sorted({int(y) for y in df.index.year.dropna()}) or [today.year]
	            year = st.selectbox("Select year", options=years, index=len(years) - 1)

	            # Convert month name to its number and filter on month + year
	            month = month_names.index(month_str) + 1
	            month_df = df[(df.index.month == month) & (df.index.year == year)]

	            st.write(f"Showing data for {month_str} {year}")

	            display_data(month_df)

	    elif selection == "Aggregated data":
	        aggregated_data(df)
	        aggregated_data(df, aggregation_level="month")

	    else:
	        st.write("Error: selection not recognized")



	# Entry point: build the dashboard when this file is executed as a script.
	if __name__ == "__main__":
	    main()