YChang1112 committed on
Commit
f5fb58c
·
verified ·
1 Parent(s): 21d363b

Initial prototype

Browse files
Files changed (1) hide show
  1. app.py +227 -0
app.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ import pandas as pd
4
+ import json
5
+ import os
6
+ from datasets import load_dataset
7
+
8
# Page-level Streamlit settings, kept in one mapping so they are easy to scan.
_PAGE_CONFIG = {
    "page_title": "Huggingface Repository Explorer",
    "page_icon": "🤗",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)

# Heading plus the short introduction rendered beneath it.
_INTRO_MARKDOWN = (
    "\n"
    "This dashboard showcases our models and datasets on Huggingface.\n"
    "Select a dataset to view sample data.\n"
)
st.title("🤗 Huggingface Repository Explorer")
st.markdown(_INTRO_MARKDOWN)

# The access token comes from the HF_TOKEN environment variable (configured
# in the Huggingface Space), so it is never hard-coded here and users do not
# have to type it in. Falls back to an empty string when the variable is unset.
AUTH_TOKEN = os.getenv("HF_TOKEN", "")

# Base URL of the Hub REST API.
HF_API_BASE = "https://huggingface.co/api"
29
+
30
def fetch_dataset_samples(dataset_id, n=10):
    """Return up to the first ``n`` examples of a dataset's ``train`` split.

    The dataset is opened in streaming mode and only the leading examples
    are pulled from the stream.

    Args:
        dataset_id: Hub repository id of the dataset (e.g. ``"owner/name"``).
        n: Maximum number of examples to return. Values below 1 produce an
            empty list without reading from the stream.

    Returns:
        A list with at most ``n`` examples, exactly as yielded by the
        streaming dataset, or ``None`` if anything went wrong (the error is
        reported to the user through ``st.error``).
    """
    from itertools import islice

    try:
        dataset = load_dataset(
            dataset_id,
            split="train",
            streaming=True,
            # An unset HF_TOKEN leaves AUTH_TOKEN as "", which is not a usable
            # credential; pass None so no explicit token is supplied.
            token=AUTH_TOKEN or None,
        )
        # islice stops after exactly n items, so no extra example is fetched
        # from the stream just to discover that the limit was reached.
        return list(islice(dataset, max(n, 0)))
    except Exception as e:
        # Best-effort UI helper: surface the problem instead of crashing the page.
        st.error(f"Error loading dataset samples: {e}")
        return None
50
+
51
# Hard-coded model catalogue, stored column-wise: every list below has one
# entry per model, in the same order. Sizes and dates are static values
# written here by hand, not fetched from the Hub.
model_data = {
    "Model Name": [
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-c_sharp",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-python",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-C",
        "TitanCAProject/Qwen2.5-Coder-7B-Instruct_lora_r16a32-java",
        "TitanCAProject/CodeBERT-javascript",
    ],
    "Description": [
        "Qwen2.5 model for the Csharp language",
        "Qwen2.5 model for the Python language",
        "Qwen2.5 model for the C language",
        # Fixed user-facing typo: this entry previously read "Jave".
        "Qwen2.5 model for the Java language",
        "CodeBERT model for the Javascript language",
    ],
    "Size (GB)": [0.4, 0.5, 0.9, 1.3, 0.3],
    "Last Updated": [
        "2024-11-15",
        "2024-10-30",
        "2024-12-05",
        "2024-11-20",
        "2024-12-10",
    ],
}

# Tabular view of the catalogue used by the Models tab.
df_models = pd.DataFrame(model_data)
79
+
80
def fetch_dataset_info(dataset_id):
    """Collect size, row count and basic metadata for one dataset.

    Two GET requests are made: first to the datasets-server ``/size``
    endpoint (byte size and row count), then to the Hub API
    ``/datasets/<id>`` endpoint (description and last-modified date).

    Args:
        dataset_id: Hub repository id of the dataset (e.g. ``"owner/name"``).

    Returns:
        A dict with the keys ``id``, ``description``, ``size_mb``
        (``None`` when the size is missing or zero), ``sample_count`` and
        ``last_modified``; or ``None`` if either request returns a non-200
        status or any exception occurs. Failures are reported in the UI via
        ``st.warning`` / ``st.error``.
    """
    # Only attach credentials when a token is actually configured, instead of
    # sending a "Bearer " header with nothing after it.
    headers = {"Authorization": f"Bearer {AUTH_TOKEN}"} if AUTH_TOKEN else {}
    size_url = "https://datasets-server.huggingface.co/size"
    url = f"{HF_API_BASE}/datasets/{dataset_id}"
    # Without a timeout, requests may wait forever on an unresponsive server.
    timeout_seconds = 30

    try:
        # Let requests build the query string so the id is URL-encoded.
        size_response = requests.get(
            size_url,
            params={"dataset": dataset_id},
            headers=headers,
            timeout=timeout_seconds,
        )
        if size_response.status_code != 200:
            st.warning(f"Error fetching dataset size info: {size_response.status_code}")
            return None
        size_stats = size_response.json()['size']['dataset']

        # Original file size in bytes, converted to MB for display.
        size_bytes = size_stats.get('num_bytes_original_files', 0)
        size_mb = round(size_bytes / (1024 * 1024), 2) if size_bytes else None

        # Fall back to 0 for a missing or null count so callers can format it.
        sample_count = size_stats.get('num_rows') or 0

        meta_response = requests.get(url, headers=headers, timeout=timeout_seconds)
        if meta_response.status_code != 200:
            st.warning(f"Error fetching dataset info: {meta_response.status_code}")
            return None
        metadata = meta_response.json()

        # "or" also covers keys that are present but empty/null.
        return {
            'id': dataset_id,
            'description': metadata.get('description') or 'No description available',
            'size_mb': size_mb,
            'sample_count': sample_count,
            'last_modified': metadata.get('lastModified') or 'Unknown',
        }

    except Exception as e:
        # Covers network errors, timeouts, invalid JSON and unexpected
        # response shapes; the page keeps rendering without this dataset.
        st.error(f"Error processing dataset info: {e}")
        return None
119
+
120
# Top-level layout: one tab for models, one for datasets.
tab1, tab2 = st.tabs(["Models", "Datasets"])

# Models tab: full catalogue table followed by a per-model detail panel.
with tab1:
    st.header("Models")
    st.dataframe(df_models, use_container_width=True)

    st.subheader("Model Details")
    selected_model = st.selectbox(
        "Select a model for details",
        df_models["Model Name"],
        key="model_select",
    )

    if selected_model:
        # First (and expected only) catalogue row matching the chosen name.
        name_matches = df_models["Model Name"] == selected_model
        model_row = df_models.loc[name_matches].iloc[0]

        detail_lines = (
            "### " + model_row["Model Name"],
            f"**Description**: {model_row['Description']}",
            f"**Size**: {model_row['Size (GB)']} GB",
            f"**Last Updated**: {model_row['Last Updated']}",
        )
        for detail_line in detail_lines:
            st.markdown(detail_line)
141
+
142
+
143
# Datasets tab: summary table of the configured datasets plus a preview of
# the first rows of whichever dataset the user selects.
with tab2:
    st.header("Datasets")

    # Dataset repositories shown on this tab.
    dataset_ids = [
        "YChang1112/test-dataset",
        "Anthropic/EconomicIndex",
    ]

    # Query the Hub for each dataset; ids whose lookup fails are skipped
    # (fetch_dataset_info reports the failure itself and returns None).
    dataset_info_list = []
    if AUTH_TOKEN:
        with st.spinner("Loading dataset information..."):
            for dataset_id in dataset_ids:
                info = fetch_dataset_info(dataset_id)
                if info:
                    dataset_info_list.append(info)
    else:
        st.warning("Authentication token not configured. Unable to fetch dataset information.")

    if dataset_info_list:
        # Build the table straight from the info records, then expose the
        # display labels in a fixed column order.
        column_labels = {
            'id': "Dataset Name",
            'description': "Description",
            'size_mb': "Size (MB)",
            'sample_count': "Samples",
            'last_modified': "Last Modified",
        }
        df_datasets = (
            pd.DataFrame(dataset_info_list)[list(column_labels)]
            .rename(columns=column_labels)
        )
        st.dataframe(df_datasets, use_container_width=True)
    else:
        st.error("No dataset information available. Please check your dataset IDs and authentication token.")

    # Dataset details with sample preview
    st.subheader("Dataset Preview")

    if dataset_info_list:
        # Index the records by id so the selection maps directly to its info.
        info_by_id = {info['id']: info for info in dataset_info_list}
        selected_dataset = st.selectbox(
            "Select a dataset to preview",
            list(info_by_id),
            key="dataset_select",
        )
        dataset_info = info_by_id.get(selected_dataset)

        if dataset_info:
            st.markdown(f"### {dataset_info['id']}")
            st.markdown(f"**Description**: {dataset_info['description']}")
            st.markdown(f"**Size**: {dataset_info['size_mb']} MB")
            st.markdown(f"**Total Samples**: {dataset_info['sample_count']:,}")
            st.markdown(f"**Last Modified**: {dataset_info['last_modified']}")

            st.markdown("### Sample Train Data")

            with st.spinner("Fetching dataset samples..."):
                samples = fetch_dataset_samples(selected_dataset)

            # fetch_dataset_samples returns either a list of examples or
            # None, so a truthy value is always a non-empty list here.
            if samples:
                try:
                    # json_normalize flattens nested fields into columns.
                    df_sample = pd.json_normalize(samples)
                    st.dataframe(df_sample, use_container_width=True)
                except Exception as e:
                    st.error(f"Error displaying samples: {e}")
                    st.json(samples)  # Fallback to raw JSON display
            else:
                st.warning("Could not fetch dataset samples.")
223
+
224
# Footer: a horizontal rule followed by a static caption, rendered below the tabs.
for footer_markdown in ("---", "Repository Explorer | Last updated: April 2025"):
    st.markdown(footer_markdown)
227
+