"""Streamlit dashboard that showcases Huggingface models and datasets.

Models come from a hard-coded catalogue; dataset metadata and sample rows
are fetched live from the Huggingface Hub APIs using a token supplied via
the HF_TOKEN environment variable (configured as a Space secret).
"""

import streamlit as st
import requests
import pandas as pd
import json
import os
from datasets import load_dataset

# Set page configuration
st.set_page_config(
    page_title="Huggingface Repository Explorer",
    page_icon="🤗",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Title and description
st.title("🤗 Huggingface Repository Explorer")
st.markdown("""
This dashboard showcases our models and datasets on Huggingface.
Select a dataset to view sample data.
""")

# Access token will be set up via environment variable in the Huggingface Space.
# This way it's not exposed in the code and users don't need to enter it.
AUTH_TOKEN = os.environ.get("HF_TOKEN", "")

# HF API endpoints
HF_API_BASE = "https://huggingface.co/api"


def fetch_dataset_samples(dataset_id, n=10):
    """Return the first *n* training examples of *dataset_id* as a list of dicts.

    The dataset is opened in streaming mode so only the requested rows are
    downloaded. On any failure a Streamlit error is shown and None is
    returned.

    Args:
        dataset_id: Hub repository id, e.g. "org/name".
        n: maximum number of examples to collect (default 10).
    """
    try:
        # Streaming avoids downloading the full dataset just for a preview.
        dataset = load_dataset(dataset_id, split="train", streaming=True, token=AUTH_TOKEN)
        samples = []
        for i, example in enumerate(dataset):
            if i >= n:
                break
            samples.append(example)
        return samples
    except Exception as e:
        st.error(f"Error loading dataset samples: {e}")
        return None


# Hard-coded model catalogue shown in the "Models" tab.
model_data = {
    "Model Name": [
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-c_sharp",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-python",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-C",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-java",
        "TitanCAProject/CodeBERT-javascript",
    ],
    "Description": [
        "Qwen2.5 model for the Csharp language",
        "Qwen2.5 model for the Python language",
        "Qwen2.5 model for the C language",
        "Qwen2.5 model for the Java language",  # fixed typo: was "Jave"
        "CodeBERT model for the Javascript language",
    ],
    "Size (GB)": [0.4, 0.5, 0.9, 1.3, 0.3],
    "Last Updated": [
        "2024-11-15",
        "2024-10-30",
        "2024-12-05",
        "2024-11-20",
        "2024-12-10",
    ],
}

# Convert to DataFrame
df_models = pd.DataFrame(model_data)


def fetch_dataset_info(dataset_id):
    """Fetch size, row count and metadata for *dataset_id* from the Hub.

    Combines two endpoints: the datasets-server /size endpoint (byte and
    row counts) and the Hub /api/datasets endpoint (description, last
    modified). Returns a summary dict, or None after surfacing a Streamlit
    warning/error on any failure.
    """
    headers = {"Authorization": f"Bearer {AUTH_TOKEN}"}
    size_url = f"https://datasets-server.huggingface.co/size?dataset={dataset_id}"
    info_url = f"{HF_API_BASE}/datasets/{dataset_id}"
    try:
        # timeout keeps the Streamlit app from hanging on a stalled request
        size_response = requests.get(size_url, headers=headers, timeout=30)
        if size_response.status_code != 200:
            st.warning(f"Error fetching dataset size info: {size_response.status_code}")
            return None
        size_payload = size_response.json()

        # Size of the original files, converted to MB for display.
        size_bytes = size_payload['size']['dataset'].get('num_bytes_original_files', 0)
        size_mb = round(size_bytes / (1024 * 1024), 2) if size_bytes else None

        # Total row count across the dataset.
        sample_count = size_payload['size']['dataset'].get('num_rows', 0)

        info_response = requests.get(info_url, headers=headers, timeout=30)
        if info_response.status_code != 200:
            st.warning(f"Error fetching dataset info: {info_response.status_code}")
            return None
        dataset_info = info_response.json()

        return {
            'id': dataset_id,
            'description': dataset_info.get('description', 'No description available'),
            'size_mb': size_mb,
            'sample_count': sample_count,
            'last_modified': dataset_info.get('lastModified', 'Unknown'),
        }
    except Exception as e:
        st.error(f"Error processing dataset info: {e}")
        return None


# Main tabs
tab1, tab2 = st.tabs(["Models", "Datasets"])

# Models Tab
with tab1:
    st.header("Models")

    # Display models table
    st.dataframe(df_models, use_container_width=True)

    # Selected model details
    st.subheader("Model Details")
    selected_model = st.selectbox(
        "Select a model for details", df_models["Model Name"], key="model_select"
    )
    if selected_model:
        model_details = df_models[df_models["Model Name"] == selected_model].iloc[0]
        st.markdown("### " + model_details["Model Name"])
        st.markdown(f"**Description**: {model_details['Description']}")
        st.markdown(f"**Size**: {model_details['Size (GB)']} GB")
        st.markdown(f"**Last Updated**: {model_details['Last Updated']}")

# Datasets Tab
with tab2:
    st.header("Datasets")

    # List of dataset IDs to display
    dataset_ids = [
        "YChang1112/test-dataset",
        "Anthropic/EconomicIndex",
    ]

    # Get actual dataset info from the API (requires the token).
    dataset_info_list = []
    if AUTH_TOKEN:
        with st.spinner("Loading dataset information..."):
            for dataset_id in dataset_ids:
                info = fetch_dataset_info(dataset_id)
                if info:
                    dataset_info_list.append(info)
    else:
        st.warning("Authentication token not configured. Unable to fetch dataset information.")

    # Create a DataFrame from the collected information
    if dataset_info_list:
        df_datasets = pd.DataFrame({
            "Dataset Name": [info['id'] for info in dataset_info_list],
            "Description": [info['description'] for info in dataset_info_list],
            "Size (MB)": [info['size_mb'] for info in dataset_info_list],
            "Samples": [info['sample_count'] for info in dataset_info_list],
            "Last Modified": [info['last_modified'] for info in dataset_info_list],
        })
        # Display datasets table
        st.dataframe(df_datasets, use_container_width=True)
    else:
        st.error("No dataset information available. Please check your dataset IDs and authentication token.")

    # Dataset details with sample preview
    st.subheader("Dataset Preview")
    if dataset_info_list:
        selected_dataset = st.selectbox(
            "Select a dataset to preview",
            [info['id'] for info in dataset_info_list],
            key="dataset_select",
        )
        if selected_dataset:
            # Find the matching dataset info entry
            dataset_info = next(
                (info for info in dataset_info_list if info['id'] == selected_dataset), None
            )
            if dataset_info:
                st.markdown(f"### {dataset_info['id']}")
                st.markdown(f"**Description**: {dataset_info['description']}")
                st.markdown(f"**Size**: {dataset_info['size_mb']} MB")
                st.markdown(f"**Total Samples**: {dataset_info['sample_count']:,}")
                st.markdown(f"**Last Modified**: {dataset_info['last_modified']}")

                # Show dataset samples
                st.markdown("### Sample Train Data")
                with st.spinner("Fetching dataset samples..."):
                    samples = fetch_dataset_samples(selected_dataset)
                    if samples:
                        # Convert samples to a DataFrame if possible.
                        try:
                            if isinstance(samples, list) and len(samples) > 0:
                                # json_normalize flattens nested structures into columns.
                                df_sample = pd.json_normalize(samples)
                                st.dataframe(df_sample, use_container_width=True)
                            elif isinstance(samples, dict):
                                # Single sample object.
                                df_sample = pd.DataFrame([samples])
                                st.dataframe(df_sample, use_container_width=True)
                            else:
                                st.json(samples)
                        except Exception as e:
                            st.error(f"Error displaying samples: {e}")
                            st.json(samples)  # fallback to raw JSON display
                    else:
                        st.warning("Could not fetch dataset samples.")

# Footer
st.markdown("---")
st.markdown("Repository Explorer | Last updated: April 2025")