|
import streamlit as st |
|
import requests |
|
import pandas as pd |
|
import json |
|
import os |
|
from datasets import load_dataset |
|
|
|
|
|
# Configure the Streamlit page before any other UI calls (Streamlit requires
# set_page_config to run first).
# NOTE(review): the original icon literal was mojibake ("π€"); restored to the
# Hugging Face emoji it presumably encoded — confirm against the deployed app.
st.set_page_config(
    page_title="Huggingface Repository Explorer",
    page_icon="🤗",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
|
|
|
|
# Page title and introductory blurb.
# NOTE(review): "π€" in the title was an encoding artifact; restored to 🤗.
st.title("🤗 Huggingface Repository Explorer")
st.markdown("""
This dashboard showcases our models and datasets on Huggingface.
Select a dataset to view sample data.
""")
|
|
|
|
|
|
|
# Hugging Face access token, read from the environment; empty string when unset.
AUTH_TOKEN = os.getenv("HF_TOKEN", "")

# Base URL of the Hugging Face Hub REST API.
HF_API_BASE = "https://huggingface.co/api"
|
|
|
|
|
def fetch_dataset_samples(dataset_id, n=10):
    """Stream the first *n* examples of a dataset's train split.

    Args:
        dataset_id: Hugging Face dataset identifier, e.g. "org/name".
        n: Maximum number of examples to return (default 10).

    Returns:
        A list of example dicts, or None when loading fails (the error is
        shown in the UI via st.error).
    """
    from itertools import islice

    try:
        # Streaming mode avoids downloading the full dataset for a preview.
        dataset = load_dataset(
            dataset_id,
            split="train",
            streaming=True,
            # Pass None rather than an empty string when no token is
            # configured — an empty-string token can break authentication.
            token=AUTH_TOKEN or None,
        )
        # Take at most n examples from the streaming iterator.
        return list(islice(dataset, n))
    except Exception as e:
        st.error(f"Error loading dataset samples: {e}")
        return None
|
|
|
|
|
# Static catalogue of the project's published models, rendered in the Models
# tab. NOTE: fixed user-facing typo "Jave" -> "Java" in the Java entry.
model_data = {
    "Model Name": [
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-c_sharp",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-python",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-C",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-java",
        "TitanCAProject/CodeBERT-javascript"
    ],
    "Description": [
        "Qwen2.5 model for the Csharp language",
        "Qwen2.5 model for the Python language",
        "Qwen2.5 model for the C language",
        "Qwen2.5 model for the Java language",
        "CodeBERT model for the Javascript language"
    ],
    "Size (GB)": [0.4, 0.5, 0.9, 1.3, 0.3],
    "Last Updated": [
        "2024-11-15",
        "2024-10-30",
        "2024-12-05",
        "2024-11-20",
        "2024-12-10"
    ]
}

# Tabular view of the catalogue used by the Models tab.
df_models = pd.DataFrame(model_data)
|
|
|
|
|
def fetch_dataset_info(dataset_id):
    """Fetch size statistics and metadata for a dataset on the HF Hub.

    Queries the datasets-server /size endpoint for byte size and row count,
    then the Hub API for description and last-modified date.

    Args:
        dataset_id: Hugging Face dataset identifier, e.g. "org/name".

    Returns:
        A dict with keys 'id', 'description', 'size_mb', 'sample_count' and
        'last_modified', or None on any HTTP or parsing failure (a warning
        or error is shown in the UI).
    """
    headers = {"Authorization": f"Bearer {AUTH_TOKEN}"}
    size_url = f"https://datasets-server.huggingface.co/size?dataset={dataset_id}"
    url = f"{HF_API_BASE}/datasets/{dataset_id}"

    try:
        # A timeout prevents the UI from hanging forever on a stalled request
        # (requests has no default timeout).
        response = requests.get(size_url, headers=headers, timeout=30)
        if response.status_code != 200:
            st.warning(f"Error fetching dataset size info: {response.status_code}")
            return None
        dataset_info = response.json()

        # Original (pre-conversion) file size in bytes; may be absent.
        size_bytes = dataset_info['size']['dataset'].get('num_bytes_original_files', 0)
        size_mb = round(size_bytes / (1024 * 1024), 2) if size_bytes else None

        sample_count = dataset_info['size']['dataset'].get('num_rows', 0)

        # Second call: general dataset metadata from the Hub API.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            st.warning(f"Error fetching dataset info: {response.status_code}")
            return None
        dataset_info = response.json()

        result = {
            'id': dataset_id,
            'description': dataset_info.get('description', 'No description available'),
            'size_mb': size_mb,
            'sample_count': sample_count,
            'last_modified': dataset_info.get('lastModified', 'Unknown')
        }
        return result

    except Exception as e:
        st.error(f"Error processing dataset info: {e}")
        return None
|
|
|
|
|
# Two top-level tabs: model catalogue and dataset explorer.
tab1, tab2 = st.tabs(["Models", "Datasets"])

with tab1:
    st.header("Models")

    # Full catalogue table.
    st.dataframe(df_models, use_container_width=True)

    # Drill-down view for a single model.
    st.subheader("Model Details")
    selected_model = st.selectbox(
        "Select a model for details",
        df_models["Model Name"],
        key="model_select",
    )

    if selected_model:
        row = df_models[df_models["Model Name"] == selected_model].iloc[0]

        st.markdown("### " + row["Model Name"])
        st.markdown(f"**Description**: {row['Description']}")
        st.markdown(f"**Size**: {row['Size (GB)']} GB")
        st.markdown(f"**Last Updated**: {row['Last Updated']}")
|
|
|
|
|
with tab2:
    st.header("Datasets")

    # Datasets to surface in this tab.
    dataset_ids = [
        "YChang1112/test-dataset",
        "Anthropic/EconomicIndex"
    ]

    # Gather metadata for every dataset, skipping any that fail to load.
    dataset_info_list = []
    if AUTH_TOKEN:
        with st.spinner("Loading dataset information..."):
            dataset_info_list = [
                info for ds_id in dataset_ids
                if (info := fetch_dataset_info(ds_id))
            ]
    else:
        st.warning("Authentication token not configured. Unable to fetch dataset information.")

    if dataset_info_list:
        # Map display column labels to the keys of each info dict.
        column_keys = {
            "Dataset Name": "id",
            "Description": "description",
            "Size (MB)": "size_mb",
            "Samples": "sample_count",
            "Last Modified": "last_modified",
        }
        df_datasets = pd.DataFrame({
            label: [info[key] for info in dataset_info_list]
            for label, key in column_keys.items()
        })

        st.dataframe(df_datasets, use_container_width=True)
    else:
        st.error("No dataset information available. Please check your dataset IDs and authentication token.")

    st.subheader("Dataset Preview")

    if dataset_info_list:
        selected_dataset = st.selectbox(
            "Select a dataset to preview",
            [info['id'] for info in dataset_info_list],
            key="dataset_select",
        )

        if selected_dataset:
            # Look up the metadata record matching the selection.
            chosen = next(
                (info for info in dataset_info_list if info['id'] == selected_dataset),
                None,
            )

            if chosen:
                st.markdown(f"### {chosen['id']}")
                st.markdown(f"**Description**: {chosen['description']}")
                st.markdown(f"**Size**: {chosen['size_mb']} MB")
                st.markdown(f"**Total Samples**: {chosen['sample_count']:,}")
                st.markdown(f"**Last Modified**: {chosen['last_modified']}")

                st.markdown("### Sample Train Data")

                with st.spinner("Fetching dataset samples..."):
                    samples = fetch_dataset_samples(selected_dataset)

                if samples:
                    try:
                        # Render samples as a flat table when possible,
                        # falling back to raw JSON output.
                        if isinstance(samples, list) and len(samples) > 0:
                            st.dataframe(pd.json_normalize(samples), use_container_width=True)
                        elif isinstance(samples, dict):
                            st.dataframe(pd.DataFrame([samples]), use_container_width=True)
                        else:
                            st.json(samples)
                    except Exception as e:
                        st.error(f"Error displaying samples: {e}")
                        st.json(samples)
                else:
                    st.warning("Could not fetch dataset samples.")
|
|
|
|
|
# Page footer: horizontal rule plus attribution line.
for footer_text in ("---", "Repository Explorer | Last updated: April 2025"):
    st.markdown(footer_text)
|
|
|
|