"""Streamlit dashboard that showcases Huggingface models and datasets.

Models come from a hard-coded catalogue; dataset metadata and sample rows
are fetched live from the Huggingface Hub APIs using a token supplied via
the HF_TOKEN environment variable (configured as a Space secret).
"""

import streamlit as st
import requests
import pandas as pd
import json
import os
from datasets import load_dataset

# Set page configuration
st.set_page_config(
    page_title="Huggingface Repository Explorer",
    page_icon="🤗",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Title and description
st.title("🤗 Huggingface Repository Explorer")
st.markdown("""
This dashboard showcases our models and datasets on Huggingface.
Select a dataset to view sample data.
""")

# Access token will be set up via environment variable in the Huggingface Space.
# This way it's not exposed in the code and users don't need to enter it.
AUTH_TOKEN = os.environ.get("HF_TOKEN", "")

# HF API endpoints
HF_API_BASE = "https://huggingface.co/api"


def fetch_dataset_samples(dataset_id, n=10):
    """Return the first *n* training examples of *dataset_id* as a list of dicts.

    The dataset is opened in streaming mode so only the requested rows are
    downloaded. On any failure a Streamlit error is shown and None is
    returned.

    Args:
        dataset_id: Hub repository id, e.g. "org/name".
        n: maximum number of examples to collect (default 10).
    """
    try:
        # Streaming avoids downloading the full dataset just for a preview.
        dataset = load_dataset(dataset_id, split="train", streaming=True, token=AUTH_TOKEN)
        samples = []
        for i, example in enumerate(dataset):
            if i >= n:
                break
            samples.append(example)
        return samples
    except Exception as e:
        st.error(f"Error loading dataset samples: {e}")
        return None


# Hard-coded model catalogue shown in the "Models" tab.
model_data = {
    "Model Name": [
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-c_sharp",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-python",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-C",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-java",
        "TitanCAProject/CodeBERT-javascript",
    ],
    "Description": [
        "Qwen2.5 model for the Csharp language",
        "Qwen2.5 model for the Python language",
        "Qwen2.5 model for the C language",
        "Qwen2.5 model for the Java language",  # fixed typo: was "Jave"
        "CodeBERT model for the Javascript language",
    ],
    "Size (GB)": [0.4, 0.5, 0.9, 1.3, 0.3],
    "Last Updated": [
        "2024-11-15",
        "2024-10-30",
        "2024-12-05",
        "2024-11-20",
        "2024-12-10",
    ],
}

# Convert to DataFrame
df_models = pd.DataFrame(model_data)


def fetch_dataset_info(dataset_id):
    """Fetch size, row count and metadata for *dataset_id* from the Hub.

    Combines two endpoints: the datasets-server /size endpoint (byte and
    row counts) and the Hub /api/datasets endpoint (description, last
    modified). Returns a summary dict, or None after surfacing a Streamlit
    warning/error on any failure.
    """
    headers = {"Authorization": f"Bearer {AUTH_TOKEN}"}
    size_url = f"https://datasets-server.huggingface.co/size?dataset={dataset_id}"
    info_url = f"{HF_API_BASE}/datasets/{dataset_id}"
    try:
        # timeout keeps the Streamlit app from hanging on a stalled request
        size_response = requests.get(size_url, headers=headers, timeout=30)
        if size_response.status_code != 200:
            st.warning(f"Error fetching dataset size info: {size_response.status_code}")
            return None
        size_payload = size_response.json()

        # Size of the original files, converted to MB for display.
        size_bytes = size_payload['size']['dataset'].get('num_bytes_original_files', 0)
        size_mb = round(size_bytes / (1024 * 1024), 2) if size_bytes else None

        # Total row count across the dataset.
        sample_count = size_payload['size']['dataset'].get('num_rows', 0)

        info_response = requests.get(info_url, headers=headers, timeout=30)
        if info_response.status_code != 200:
            st.warning(f"Error fetching dataset info: {info_response.status_code}")
            return None
        dataset_info = info_response.json()

        return {
            'id': dataset_id,
            'description': dataset_info.get('description', 'No description available'),
            'size_mb': size_mb,
            'sample_count': sample_count,
            'last_modified': dataset_info.get('lastModified', 'Unknown'),
        }
    except Exception as e:
        st.error(f"Error processing dataset info: {e}")
        return None


# Main tabs
tab1, tab2 = st.tabs(["Models", "Datasets"])

# Models Tab
with tab1:
    st.header("Models")

    # Display models table
    st.dataframe(df_models, use_container_width=True)

    # Selected model details
    st.subheader("Model Details")
    selected_model = st.selectbox(
        "Select a model for details", df_models["Model Name"], key="model_select"
    )
    if selected_model:
        model_details = df_models[df_models["Model Name"] == selected_model].iloc[0]
        st.markdown("### " + model_details["Model Name"])
        st.markdown(f"**Description**: {model_details['Description']}")
        st.markdown(f"**Size**: {model_details['Size (GB)']} GB")
        st.markdown(f"**Last Updated**: {model_details['Last Updated']}")

# Datasets Tab
with tab2:
    st.header("Datasets")

    # List of dataset IDs to display
    dataset_ids = [
        "YChang1112/test-dataset",
        "Anthropic/EconomicIndex",
    ]

    # Get actual dataset info from the API (requires the token).
    dataset_info_list = []
    if AUTH_TOKEN:
        with st.spinner("Loading dataset information..."):
            for dataset_id in dataset_ids:
                info = fetch_dataset_info(dataset_id)
                if info:
                    dataset_info_list.append(info)
    else:
        st.warning("Authentication token not configured. Unable to fetch dataset information.")

    # Create a DataFrame from the collected information
    if dataset_info_list:
        df_datasets = pd.DataFrame({
            "Dataset Name": [info['id'] for info in dataset_info_list],
            "Description": [info['description'] for info in dataset_info_list],
            "Size (MB)": [info['size_mb'] for info in dataset_info_list],
            "Samples": [info['sample_count'] for info in dataset_info_list],
            "Last Modified": [info['last_modified'] for info in dataset_info_list],
        })
        # Display datasets table
        st.dataframe(df_datasets, use_container_width=True)
    else:
        st.error("No dataset information available. Please check your dataset IDs and authentication token.")

    # Dataset details with sample preview
    st.subheader("Dataset Preview")
    if dataset_info_list:
        selected_dataset = st.selectbox(
            "Select a dataset to preview",
            [info['id'] for info in dataset_info_list],
            key="dataset_select",
        )
        if selected_dataset:
            # Find the matching dataset info entry
            dataset_info = next(
                (info for info in dataset_info_list if info['id'] == selected_dataset), None
            )
            if dataset_info:
                st.markdown(f"### {dataset_info['id']}")
                st.markdown(f"**Description**: {dataset_info['description']}")
                st.markdown(f"**Size**: {dataset_info['size_mb']} MB")
                st.markdown(f"**Total Samples**: {dataset_info['sample_count']:,}")
                st.markdown(f"**Last Modified**: {dataset_info['last_modified']}")

                # Show dataset samples
                st.markdown("### Sample Train Data")
                with st.spinner("Fetching dataset samples..."):
                    samples = fetch_dataset_samples(selected_dataset)
                    if samples:
                        # Convert samples to a DataFrame if possible.
                        try:
                            if isinstance(samples, list) and len(samples) > 0:
                                # json_normalize flattens nested structures into columns.
                                df_sample = pd.json_normalize(samples)
                                st.dataframe(df_sample, use_container_width=True)
                            elif isinstance(samples, dict):
                                # Single sample object.
                                df_sample = pd.DataFrame([samples])
                                st.dataframe(df_sample, use_container_width=True)
                            else:
                                st.json(samples)
                        except Exception as e:
                            st.error(f"Error displaying samples: {e}")
                            st.json(samples)  # fallback to raw JSON display
                    else:
                        st.warning("Could not fetch dataset samples.")

# Footer
st.markdown("---")
st.markdown("Repository Explorer | Last updated: April 2025")