Spaces:

YChang1112
/

Titan_Project_Dashboard

Running

App Files Community

Titan_Project_Dashboard / app.py

YChang1112

Initial prototype

f5fb58c verified 25 days ago

raw

history blame contribute delete

8.51 kB

	import streamlit as st
	import requests
	import pandas as pd
	import json
	import os
	from datasets import load_dataset

	# Set page configuration
	page_options = {
	    "page_title": "Huggingface Repository Explorer",
	    "page_icon": "🤗",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**page_options)

	# Title and description shown at the top of the dashboard
	st.title("🤗 Huggingface Repository Explorer")
	intro_text = """
	This dashboard showcases our models and datasets on Huggingface.
	Select a dataset to view sample data.
	"""
	st.markdown(intro_text)

	# The access token is read from the HF_TOKEN environment variable of the
	# Huggingface Space, so it is not exposed in the code and users do not need
	# to enter it. It is an empty string when the variable is not set.
	AUTH_TOKEN = os.getenv("HF_TOKEN", "")

	# Base URL used to build HF API endpoint addresses
	HF_API_BASE = "https://huggingface.co/api"

	# Function to fetch dataset samples using the pre-configured token
	def fetch_dataset_samples(dataset_id, n=10):
	    """Return up to ``n`` examples from the ``train`` split of a dataset.

	    The dataset is opened with ``streaming=True`` and only the first ``n``
	    examples are taken from the resulting iterable.

	    Args:
	        dataset_id: Dataset identifier passed to ``load_dataset``.
	        n: Maximum number of examples to return. Negative values are
	            treated as zero.

	    Returns:
	        A list with at most ``n`` examples, or ``None`` if anything raised
	        while loading (the error is reported through ``st.error``).
	    """
	    from itertools import islice

	    try:
	        # Load the dataset in streaming mode
	        dataset = load_dataset(
	            dataset_id,
	            split="train",
	            streaming=True,
	            # AUTH_TOKEN is "" when HF_TOKEN is unset; pass None rather than
	            # an empty token string.
	            # NOTE(review): presumably None means "token not specified" to
	            # load_dataset - confirm against the datasets documentation.
	            token=AUTH_TOKEN or None,
	        )

	        # islice stops pulling from the stream as soon as n examples have
	        # been taken, so no extra example is fetched just to be discarded.
	        return list(islice(dataset, max(n, 0)))
	    except Exception as e:
	        st.error(f"Error loading dataset samples: {e}")
	        return None

	# Hard-coded model list. The four lists are index-aligned: entry i of each
	# list describes the same model, and each key becomes one DataFrame column.
	model_data = {
	    "Model Name": [
	        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-c_sharp",
	        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-python",
	        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-C",
	        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-java",
	        "TitanCAProject/CodeBERT-javascript",
	    ],
	    "Description": [
	        "Qwen2.5 model for the Csharp language",
	        "Qwen2.5 model for the Python language",
	        "Qwen2.5 model for the C language",
	        "Qwen2.5 model for the Java language",
	        "CodeBERT model for the Javascript language",
	    ],
	    "Size (GB)": [0.4, 0.5, 0.9, 1.3, 0.3],
	    "Last Updated": [
	        "2024-11-15",
	        "2024-10-30",
	        "2024-12-05",
	        "2024-11-20",
	        "2024-12-10",
	    ],
	}

	# Convert to a DataFrame (one row per model) for display and lookup
	df_models = pd.DataFrame(model_data)

	# Function to fetch dataset info including size and sample count
	def fetch_dataset_info(dataset_id):
	    """Collect size, row count and metadata for one dataset.

	    Two GET requests are made: one to the datasets-server ``/size`` endpoint
	    (byte size and row count) and one to ``{HF_API_BASE}/datasets/<id>``
	    (description and last-modified value).

	    Args:
	        dataset_id: Dataset identifier, e.g. ``"owner/name"``.

	    Returns:
	        A dict with the keys ``id``, ``description``, ``size_mb``,
	        ``sample_count`` and ``last_modified``. ``size_mb`` is ``None`` when
	        the reported byte count is missing or zero. Returns ``None`` when
	        either request answers with a non-200 status (reported through
	        ``st.warning``) or when anything raises (reported through
	        ``st.error``).
	    """
	    # Only send an Authorization header when a token is actually configured,
	    # instead of a "Bearer " header with nothing after it.
	    headers = {"Authorization": f"Bearer {AUTH_TOKEN}"} if AUTH_TOKEN else {}
	    size_url = "https://datasets-server.huggingface.co/size"
	    url = f"{HF_API_BASE}/datasets/{dataset_id}"
	    # Without a timeout a stalled connection would block the page forever.
	    timeout_seconds = 10

	    try:
	        # Passing the id through params lets requests URL-encode it.
	        response = requests.get(
	            size_url,
	            params={"dataset": dataset_id},
	            headers=headers,
	            timeout=timeout_seconds,
	        )
	        if response.status_code != 200:
	            st.warning(f"Error fetching dataset size info: {response.status_code}")
	            return None
	        size_info = response.json()['size']['dataset']

	        # Get size information and convert to MB for display
	        size_bytes = size_info.get('num_bytes_original_files', 0)
	        size_mb = round(size_bytes / (1024 * 1024), 2) if size_bytes else None

	        # Get row count information
	        sample_count = size_info.get('num_rows', 0)

	        response = requests.get(url, headers=headers, timeout=timeout_seconds)
	        if response.status_code != 200:
	            st.warning(f"Error fetching dataset info: {response.status_code}")
	            return None
	        dataset_info = response.json()

	        return {
	            'id': dataset_id,
	            'description': dataset_info.get('description', 'No description available'),
	            'size_mb': size_mb,
	            'sample_count': sample_count,
	            'last_modified': dataset_info.get('lastModified', 'Unknown'),
	        }

	    except Exception as e:
	        # Best-effort boundary: network failures, timeouts, invalid JSON and
	        # unexpected response shapes all end up here and are shown to the user.
	        st.error(f"Error processing dataset info: {e}")
	        return None

	# Main tabs
	tab_labels = ["Models", "Datasets"]
	tab1, tab2 = st.tabs(tab_labels)

	# Models Tab: the full table, followed by a detail view of one chosen model
	with tab1:
	    st.header("Models")
	    st.dataframe(df_models, use_container_width=True)

	    st.subheader("Model Details")
	    selected_model = st.selectbox(
	        "Select a model for details",
	        df_models["Model Name"],
	        key="model_select",
	    )

	    if selected_model:
	        # First row whose name matches the selection
	        name_matches = df_models["Model Name"] == selected_model
	        model_details = df_models[name_matches].iloc[0]

	        detail_lines = (
	            "### " + model_details["Model Name"],
	            f"Description: {model_details['Description']}",
	            f"Size: {model_details['Size (GB)']} GB",
	            f"Last Updated: {model_details['Last Updated']}",
	        )
	        for detail_line in detail_lines:
	            st.markdown(detail_line)


	# Datasets Tab: a summary table of the configured datasets, followed by a
	# preview of sample rows for one selected dataset.
	with tab2:
	    st.header("Datasets")

	    # List of dataset IDs to display
	    dataset_ids = [
	        "YChang1112/test-dataset",
	        "Anthropic/EconomicIndex",
	    ]

	    # Get actual dataset info from API. Lookups are only attempted when a
	    # token is configured; datasets whose lookup returns nothing are skipped.
	    dataset_info_list = []
	    if AUTH_TOKEN:
	        with st.spinner("Loading dataset information..."):
	            for dataset_id in dataset_ids:
	                info = fetch_dataset_info(dataset_id)
	                if info:
	                    dataset_info_list.append(info)
	    else:
	        st.warning("Authentication token not configured. Unable to fetch dataset information.")

	    # Create a DataFrame from the collected information
	    if dataset_info_list:
	        df_datasets = pd.DataFrame({
	            "Dataset Name": [info['id'] for info in dataset_info_list],
	            "Description": [info['description'] for info in dataset_info_list],
	            "Size (MB)": [info['size_mb'] for info in dataset_info_list],
	            "Samples": [info['sample_count'] for info in dataset_info_list],
	            "Last Modified": [info['last_modified'] for info in dataset_info_list],
	        })

	        # Display datasets table
	        st.dataframe(df_datasets, use_container_width=True)
	    else:
	        st.error("No dataset information available. Please check your dataset IDs and authentication token.")

	    # Dataset details with sample preview
	    st.subheader("Dataset Preview")

	    if dataset_info_list:
	        selected_dataset = st.selectbox(
	            "Select a dataset to preview",
	            [info['id'] for info in dataset_info_list],
	            key="dataset_select",
	        )

	        if selected_dataset:
	            # Find the dataset info
	            dataset_info = next(
	                (info for info in dataset_info_list if info['id'] == selected_dataset),
	                None,
	            )

	            if dataset_info:
	                # size_mb can be None and sample_count may not be numeric;
	                # show "Unknown" instead of "None MB" or a formatting error.
	                size_value = dataset_info['size_mb']
	                size_label = f"{size_value} MB" if size_value is not None else "Unknown"
	                count_value = dataset_info['sample_count']
	                if isinstance(count_value, (int, float)):
	                    count_label = f"{count_value:,}"
	                else:
	                    count_label = "Unknown"

	                st.markdown(f"### {dataset_info['id']}")
	                st.markdown(f"Description: {dataset_info['description']}")
	                st.markdown(f"Size: {size_label}")
	                st.markdown(f"Total Samples: {count_label}")
	                st.markdown(f"Last Modified: {dataset_info['last_modified']}")

	                # Show dataset samples
	                st.markdown("### Sample Train Data")

	                with st.spinner("Fetching dataset samples..."):
	                    samples = fetch_dataset_samples(selected_dataset)

	                if samples:
	                    # samples is a non-empty list of examples at this point.
	                    try:
	                        # Normalize to flatten nested structures into columns
	                        df_sample = pd.json_normalize(samples)
	                        st.dataframe(df_sample, use_container_width=True)
	                    except Exception as e:
	                        st.error(f"Error displaying samples: {e}")
	                        st.json(samples)  # Fallback to raw JSON display
	                else:
	                    st.warning("Could not fetch dataset samples.")

	# Footer: a horizontal rule followed by a one-line caption.
	st.markdown("---")
	# A plain "|" is used because backslash-pipe is not a valid Python string
	# escape sequence.
	st.markdown("Repository Explorer | Last updated: April 2025")