# Hugging Face Spaces page header captured along with the source (Space status: Sleeping).
import streamlit as st | |
import numpy as np | |
import pandas as pd | |
from datasets import load_dataset | |
# Datasets the leaderboard can be built from; the first entry is the default.
sources = [
    "https://huggingface.co/datasets/cfahlgren1/hub-stats",
    "https://huggingface.co/datasets/maxiw/hf-posts",
]

# Use the full browser width for the page.
st.set_page_config(layout="wide")

# Top row: page title on the left, data-source picker on the right
# (2:3 width ratio; adjust as needed).
title_col, source_col = st.columns([2, 3])
title_col.header("HuggingFace 🤗 Posts leaderboard")
selected_source = source_col.selectbox("Data Source:", options=sources, index=0)
@st.cache_data(ttl=3600)
def _load_hub_stats_posts():
    """Load posts from the cfahlgren1/hub-stats dataset.

    Returns:
        A DataFrame with the dataset's columns plus ``Name`` (copied from
        ``fullname``) and ``username`` (copied from ``name``), and with
        ``publishedAt`` converted to datetimes.

    The result is cached for an hour so that widget interactions, which
    rerun the whole script, do not download the file again each time.
    """
    posts = pd.read_parquet("hf://datasets/cfahlgren1/hub-stats/posts.parquet")
    # Bracket access: attribute access breaks if a column name ever collides
    # with a DataFrame attribute.
    posts["Name"] = posts["fullname"]
    posts["username"] = posts["name"]
    # No-op when the column is already datetime; keeps both sources uniform
    # for the date-range filtering done later in the script.
    posts["publishedAt"] = pd.to_datetime(posts["publishedAt"])
    return posts


@st.cache_data(ttl=3600)
def _load_hf_posts():
    """Load posts from the maxiw/hf-posts dataset.

    Returns:
        A DataFrame with the dataset's columns plus ``Name`` and
        ``username`` (taken from the ``fullname`` and ``name`` keys of each
        row's ``author`` mapping), and with ``publishedAt`` converted to
        datetimes. Cached for an hour, like the primary loader.
    """
    posts = pd.read_json("hf://datasets/maxiw/hf-posts/posts.jsonl", lines=True)
    posts["publishedAt"] = pd.to_datetime(posts["publishedAt"])
    posts["Name"] = posts["author"].apply(lambda author: author["fullname"])
    posts["username"] = posts["author"].apply(lambda author: author["name"])
    return posts


if selected_source == sources[0]:
    try:
        df = _load_hub_stats_posts()
    except Exception as exp:
        # Deliberately broad: any failure of the primary source (download or
        # unexpected schema) is reported in the UI and the app falls back to
        # the second source instead of crashing.
        st.error(
            f"\nERROR>> in loading {selected_source}\n>> {exp}",
            icon="🚨",
        )
        selected_source = sources[1]
        st.info(
            f'\nThis can be solved by "Space Restart"\n'
            f"Switching Sources for now...\n"
            f"New Source: {selected_source}",
            icon="ℹ️",
        )

# Not an ``elif``: this branch must also run after the fallback above has
# switched ``selected_source``.
if selected_source == sources[1]:
    df = _load_hf_posts()
# Columns the leaderboard can be sorted by; the first one is the default.
metrics = ["totalUniqueImpressions", "totalReactions", "numComments", "Num of posts"]

# The earliest and latest publication timestamps bound the date slider.
published = df["publishedAt"]
min_date = published.min().to_pydatetime()
max_date = published.max().to_pydatetime()

# Controls row: date-range slider next to the sort-key picker
# (3:1 width ratio; adjust as needed).
range_col, sort_col = st.columns([3, 1])
date_range = range_col.slider(
    "Select Date Range",
    min_value=min_date,
    max_value=max_date,
    value=(min_date, max_date),
    format="DD/MMM/YYYY",
)
selected_metric = sort_col.selectbox("Sort by:", options=metrics, index=0)
def _total_reactions(reactions):
    """Return the summed ``count`` over one post's reaction entries.

    Args:
        reactions: Iterable of mappings that each carry a ``count`` key, or
            a missing value (``None`` / NaN float).

    Returns:
        The total reaction count; 0 when the value is missing, instead of
        raising while trying to iterate it.
    """
    if reactions is None or isinstance(reactions, float):
        return 0
    return sum(entry["count"] for entry in reactions)


# Keep only posts published inside the selected date range. Copy the slice so
# the column assignments below write to an independent frame rather than a
# view of the loaded data (avoids pandas' SettingWithCopyWarning).
mask = df["publishedAt"].between(*date_range)
df = df[mask].copy()

df["totalReactions"] = df["reactions"].apply(_total_reactions)
# Each row is one post, so summing this column per author counts their posts.
df["Num of posts"] = 1

# Ensure metric columns are integers, treating missing values as 0.
df[metrics] = df[metrics].fillna(0).astype(int)

# Aggregate per author and rank by the chosen metric, best first.
data = (
    df.groupby(["username", "Name"])[metrics]
    .sum()
    .sort_values(selected_metric, ascending=False)
    .reset_index()
)
# 1-based rank instead of the default 0-based positional index.
data.index = np.arange(1, len(data) + 1)
data.index.name = "Rank"

# Format metric values with thousands separators. This happens after sorting
# because the values become strings here. Series.map is used per column
# because DataFrame.applymap is deprecated in recent pandas releases.
for metric in metrics:
    data[metric] = data[metric].map("{:,}".format)
def make_clickable(val):
    """Return an HTML anchor linking to a Hugging Face profile.

    Args:
        val: The username; it is converted to ``str`` before use.

    Returns:
        An ``<a target="_blank">`` element whose ``href`` is
        ``https://huggingface.co/<username>`` and whose text is the username.
        The username is percent-encoded in the URL and HTML-escaped in the
        link text, so a value containing markup characters cannot break out
        of the anchor. Names made only of letters, digits, ``-``, ``_``,
        ``.`` and ``~`` are emitted unchanged.
    """
    from html import escape
    from urllib.parse import quote

    username = str(val)
    href = f"https://huggingface.co/{quote(username, safe='')}"
    return f'<a target="_blank" href="{href}">{escape(username)}</a>'
# Render usernames as profile links. The display name is free text and the
# table is emitted with unsafe_allow_html=True, so HTML-escape that column to
# stop characters such as "<" or "&" in a name from being read as markup.
df_styled = data.style.format({"username": make_clickable}).format(
    escape="html", subset=["Name"]
)

# NOTE(review): `escape` and `index` are not documented parameters of
# Styler.to_html; they appear to be forwarded as extra template keyword
# arguments and to have no effect (the "Rank" index is still rendered).
# Confirm, then drop them or use Styler.hide() if hiding the index was intended.
st.write(
    f"""<center>{df_styled.to_html(escape=False, index=False)}</center>""",
    unsafe_allow_html=True,
)