# Page header captured with this file (not part of the program):
# author YChang1112; commit "Initial prototype" (f5fb58c, verified).
import streamlit as st
import requests
import pandas as pd
import json
import os
from datasets import load_dataset
# Set page configuration: browser-tab title and icon, wide layout, and an
# expanded sidebar on first load.
st.set_page_config(
    page_title="Huggingface Repository Explorer",
    # The icon must be a real emoji; the previous value was mis-decoded bytes.
    page_icon="🤗",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Title and description shown at the top of every page render
st.title("🤗 Huggingface Repository Explorer")
st.markdown("""
This dashboard showcases our models and datasets on Huggingface.
Select a dataset to view sample data.
""")

# Access token is read from the HF_TOKEN environment variable (configured in
# the Huggingface Space), so it is not exposed in the code and users do not
# need to enter it. Falls back to an empty string when the variable is unset.
AUTH_TOKEN = os.environ.get("HF_TOKEN", "")

# Base URL for the Huggingface Hub REST API
HF_API_BASE = "https://huggingface.co/api"
# Function to fetch dataset samples using the pre-configured token
def fetch_dataset_samples(dataset_id, n=10, split="train"):
    """Return up to ``n`` examples from the start of a dataset split.

    The dataset is opened with ``streaming=True`` and only the first ``n``
    examples are read from the resulting iterable.

    Args:
        dataset_id: Repository id passed to ``load_dataset``.
        n: Maximum number of examples to return; values <= 0 give ``[]``.
        split: Name of the split to read. Defaults to ``"train"``.

    Returns:
        A list with at most ``n`` examples, or ``None`` when loading or
        iterating raised (the error is reported through ``st.error``).
    """
    from itertools import islice

    try:
        dataset = load_dataset(
            dataset_id,
            split=split,
            streaming=True,
            # AUTH_TOKEN is "" when HF_TOKEN is unset; pass None instead so
            # an empty string is never sent as a credential.
            token=AUTH_TOKEN or None,
        )
        # islice stops after exactly n items, so no extra example is pulled
        # from the stream just to detect the end of the window.
        return list(islice(dataset, max(n, 0)))
    except Exception as e:
        # UI boundary: surface any loader/network failure to the user.
        st.error(f"Error loading dataset samples: {e}")
        return None
# Hard-coded model list, stored column-wise: each key is a table column and
# the lists are index-aligned, so entry i of every list describes model i.
model_data = {
    "Model Name": [
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-c_sharp",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-python",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-C",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-java",
        "TitanCAProject/CodeBERT-javascript",
    ],
    "Description": [
        "Qwen2.5 model for the Csharp language",
        "Qwen2.5 model for the Python language",
        "Qwen2.5 model for the C language",
        # Fixed user-facing typo ("Jave" -> "Java").
        "Qwen2.5 model for the Java language",
        "CodeBERT model for the Javascript language",
    ],
    "Size (GB)": [0.4, 0.5, 0.9, 1.3, 0.3],
    "Last Updated": [
        "2024-11-15",
        "2024-10-30",
        "2024-12-05",
        "2024-11-20",
        "2024-12-10",
    ],
}

# Convert to a DataFrame (one row per model) for display and lookup
df_models = pd.DataFrame(model_data)
# Function to fetch dataset info including size and sample count
def fetch_dataset_info(dataset_id, timeout=10):
    """Collect size, row count and repository metadata for one dataset.

    Two HTTP GET requests are made: one to the datasets-server ``/size``
    endpoint and one to ``{HF_API_BASE}/datasets/{dataset_id}``.

    Args:
        dataset_id: Dataset repository id, e.g. ``"owner/name"``.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A dict with the keys ``id``, ``description``, ``size_mb``,
        ``sample_count`` and ``last_modified``; or ``None`` when either
        request returns a non-200 status or any step raises. ``size_mb`` is
        ``None`` when the reported byte count is missing or zero.
    """
    # Only attach credentials when a token is configured; an empty token
    # would otherwise produce a malformed "Bearer " header.
    headers = {"Authorization": f"Bearer {AUTH_TOKEN}"} if AUTH_TOKEN else {}
    size_url = "https://datasets-server.huggingface.co/size"
    url = f"{HF_API_BASE}/datasets/{dataset_id}"
    try:
        # Passing the id via params lets requests URL-encode it.
        response = requests.get(
            size_url,
            params={"dataset": dataset_id},
            headers=headers,
            timeout=timeout,
        )
        if response.status_code != 200:
            st.warning(f"Error fetching dataset size info: {response.status_code}")
            return None
        size_stats = response.json()['size']['dataset']
        # Size is reported in bytes; convert to MB for display
        size_bytes = size_stats.get('num_bytes_original_files', 0)
        size_mb = round(size_bytes / (1024 * 1024), 2) if size_bytes else None
        # Row count across the dataset
        sample_count = size_stats.get('num_rows', 0)

        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            st.warning(f"Error fetching dataset info: {response.status_code}")
            return None
        repo_info = response.json()
        return {
            'id': dataset_id,
            'description': repo_info.get('description', 'No description available'),
            'size_mb': size_mb,
            'sample_count': sample_count,
            'last_modified': repo_info.get('lastModified', 'Unknown'),
        }
    except Exception as e:
        # UI boundary: network errors, timeouts, invalid JSON and unexpected
        # payload shapes are all reported to the user the same way.
        st.error(f"Error processing dataset info: {e}")
        return None
# Main tabs: one for the model catalogue, one for datasets
tab1, tab2 = st.tabs(["Models", "Datasets"])

# Models Tab
with tab1:
    st.header("Models")
    # Full table of all known models
    st.dataframe(df_models, use_container_width=True)

    # Detail view for a single model picked from the table
    st.subheader("Model Details")
    model_names = df_models["Model Name"]
    selected_model = st.selectbox(
        "Select a model for details",
        model_names,
        key="model_select",
    )
    if selected_model:
        # First row whose name matches the selection
        row = df_models.loc[model_names == selected_model].iloc[0]
        detail_lines = (
            "### " + row["Model Name"],
            f"**Description**: {row['Description']}",
            f"**Size**: {row['Size (GB)']} GB",
            f"**Last Updated**: {row['Last Updated']}",
        )
        for detail in detail_lines:
            st.markdown(detail)
# Datasets Tab: summary table plus a preview of the first training examples
with tab2:
    st.header("Datasets")
    # List of dataset IDs to display
    dataset_ids = [
        "YChang1112/test-dataset",
        "Anthropic/EconomicIndex",
    ]

    # Get actual dataset info from API; skipped entirely without a token
    dataset_info_list = []
    if AUTH_TOKEN:
        with st.spinner("Loading dataset information..."):
            for dataset_id in dataset_ids:
                info = fetch_dataset_info(dataset_id)
                # fetch_dataset_info returns None on failure; keep successes only
                if info:
                    dataset_info_list.append(info)
    else:
        st.warning("Authentication token not configured. Unable to fetch dataset information.")

    # Create a DataFrame from the collected information
    if dataset_info_list:
        df_datasets = pd.DataFrame({
            "Dataset Name": [info['id'] for info in dataset_info_list],
            "Description": [info['description'] for info in dataset_info_list],
            "Size (MB)": [info['size_mb'] for info in dataset_info_list],
            "Samples": [info['sample_count'] for info in dataset_info_list],
            "Last Modified": [info['last_modified'] for info in dataset_info_list],
        })
        # Display datasets table
        st.dataframe(df_datasets, use_container_width=True)
    else:
        st.error("No dataset information available. Please check your dataset IDs and authentication token.")

    # Dataset details with sample preview
    st.subheader("Dataset Preview")
    if dataset_info_list:
        selected_dataset = st.selectbox(
            "Select a dataset to preview",
            [info['id'] for info in dataset_info_list],
            key="dataset_select",
        )
        if selected_dataset:
            # Find the dataset info matching the selection
            dataset_info = next(
                (info for info in dataset_info_list if info['id'] == selected_dataset),
                None,
            )
            if dataset_info:
                # size_mb is None when the size is missing or zero; show
                # "Unknown" rather than the literal text "None MB".
                size_mb = dataset_info['size_mb']
                size_text = f"{size_mb} MB" if size_mb is not None else "Unknown"

                st.markdown(f"### {dataset_info['id']}")
                st.markdown(f"**Description**: {dataset_info['description']}")
                st.markdown(f"**Size**: {size_text}")
                st.markdown(f"**Total Samples**: {dataset_info['sample_count']:,}")
                st.markdown(f"**Last Modified**: {dataset_info['last_modified']}")

                # Show dataset samples
                st.markdown("### Sample Train Data")
                with st.spinner("Fetching dataset samples..."):
                    samples = fetch_dataset_samples(selected_dataset)

                if samples:
                    # samples is a non-empty list here (the fetcher returns
                    # a list or None), so only the list case needs handling.
                    try:
                        # json_normalize flattens nested structures into columns
                        df_sample = pd.json_normalize(samples)
                        st.dataframe(df_sample, use_container_width=True)
                    except Exception as e:
                        st.error(f"Error displaying samples: {e}")
                        st.json(samples)  # Fallback to raw JSON display
                else:
                    st.warning("Could not fetch dataset samples.")
# Footer: a horizontal rule followed by a static attribution line
for footer_markdown in ("---", "Repository Explorer | Last updated: April 2025"):
    st.markdown(footer_markdown)