Yoad committed on
Commit
6bf73a6
·
1 Parent(s): cdd4c6d

Add "load from leaderboard" option

Browse files
Files changed (1) hide show
  1. src/app.py +71 -8
src/app.py CHANGED
@@ -1,13 +1,17 @@
1
  import os
2
- import streamlit as st
3
- import pandas as pd
4
- import jiwer
5
- import requests
6
  from datetime import datetime
7
  from pathlib import Path
 
 
 
 
 
 
 
 
8
  from st_fixed_container import st_fixed_container
9
- from visual_eval.visualization import render_visualize_jiwer_result_html
10
  from visual_eval.evaluator import HebrewTextNormalizer
 
11
 
12
  HF_API_TOKEN = None
13
  try:
@@ -31,11 +35,55 @@ if "audio_cache" not in st.session_state:
31
  if "audio_preview_active" not in st.session_state:
32
  st.session_state.audio_preview_active = {}
33
 
 
 
 
34
 
35
  def on_file_upload():
36
  st.session_state.audio_cache = {}
37
  st.session_state.audio_preview_active = {}
38
  st.session_state.selected_entry_idx = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
 
41
  def display_rtl(html):
@@ -211,10 +259,22 @@ def main():
211
 
212
  # File uploader
213
  uploaded_file = st.file_uploader(
214
- "Upload evaluation results CSV", type=["csv"], on_change=on_file_upload
 
 
 
215
  )
216
 
217
  if uploaded_file is not None:
 
 
 
 
 
 
 
 
 
218
  # Load the data
219
  try:
220
  eval_results = pd.read_csv(uploaded_file)
@@ -292,7 +352,7 @@ def main():
292
  use_normalized = st.sidebar.toggle("Use normalized text", value=True)
293
 
294
  # Get the text columns based on the toggle
295
- if use_normalized:
296
  ref_col, hyp_col = "norm_reference_text", "norm_predicted_text"
297
  else:
298
  ref_col, hyp_col = "reference_text", "predicted_text"
@@ -307,7 +367,10 @@ def main():
307
 
308
  # If no dataset column, try to infer from filename
309
  if uploaded_file is not None:
310
- filename_stem = Path(uploaded_file.name).stem
 
 
 
311
  dataset_name = filename_stem
312
 
313
  if not dataset_name and "dataset" in eval_results.columns:
 
1
  import os
 
 
 
 
2
  from datetime import datetime
3
  from pathlib import Path
4
+
5
+ import huggingface_hub
6
+ import jiwer
7
+ import pandas as pd
8
+ import requests
9
+ import streamlit as st
10
+ from huggingface_hub import HfFileSystem
11
+
12
  from st_fixed_container import st_fixed_container
 
13
  from visual_eval.evaluator import HebrewTextNormalizer
14
+ from visual_eval.visualization import render_visualize_jiwer_result_html
15
 
16
  HF_API_TOKEN = None
17
  try:
 
35
  if "audio_preview_active" not in st.session_state:
36
  st.session_state.audio_preview_active = {}
37
 
38
# Source of the results currently shown: either the uploaded file object or
# the leaderboard URL string. Guard on the key that is actually initialised,
# rather than on the file-uploader widget key ("uploaded_file"), so the
# default does not depend on whether the widget state happens to exist.
if "results_file" not in st.session_state:
    st.session_state.results_file = None
40
+
41
 
42
  def on_file_upload():
43
  st.session_state.audio_cache = {}
44
  st.session_state.audio_preview_active = {}
45
  st.session_state.selected_entry_idx = 0
46
+ st.session_state.results_file = None
47
+
48
+
49
@st.cache_data
def get_leaderboard_result_csv_paths(root_search_path):
    """List the result CSV files found one folder below *root_search_path*.

    Globs ``<root_search_path>/*/*.csv`` through ``HfFileSystem`` (using
    ``HF_API_TOKEN``) and returns, for every match, the part of its path
    that follows ``root_search_path``. The function is wrapped in
    ``st.cache_data``.
    """
    hub_fs = HfFileSystem(token=HF_API_TOKEN)
    csv_pattern = f"{root_search_path}/*/*.csv"

    relative_paths = []
    for csv_path in hub_fs.glob(csv_pattern):
        # Keep only what comes after the search root.
        relative_paths.append(csv_path.split(root_search_path)[1])
    return relative_paths
55
+
56
+
57
@st.dialog("View Leaderboard Results")
def choose_input_file_from_leaderboard():
    """Dialog that lets the user pick a results CSV from the leaderboard space.

    Lists the CSV files found under the leaderboard's ``results`` folder in a
    select box. Once a file is chosen, its Hub URL is stored in
    ``st.session_state.results_file`` and the app is rerun.
    """
    # has_api_token is defined elsewhere in the file; when it is falsy the
    # dialog reruns the app immediately instead of querying the Hub.
    if not has_api_token:
        st.rerun()

    root_search_path = "ivrit-ai/hebrew-transcription-leaderboard/results"
    fsspec_spaces_root_search_path = f"spaces/{root_search_path}"
    found_files_relative_paths = get_leaderboard_result_csv_paths(
        fsspec_spaces_root_search_path
    )
    selected_file = st.selectbox(
        "Select a CSV file from the leaderboard:",
        found_files_relative_paths,
        index=None,
    )

    # Nothing selected yet: keep the dialog open.
    if not selected_file:
        return

    # selected_file is relative to the results root (e.g. "/<folder>/<file>.csv"),
    # so its parent is appended directly after "results".
    paths_part = Path(selected_file).parent
    file_part = Path(selected_file).name
    results_url = huggingface_hub.hf_hub_url(
        repo_id="ivrit-ai/hebrew-transcription-leaderboard",
        subfolder=f"results{paths_part}",
        filename=file_part,
        repo_type="space",
    )

    # Reset the per-file state BEFORE storing the selection: on_file_upload()
    # also sets results_file to None, so calling it afterwards would discard
    # the file that was just chosen.
    on_file_upload()
    st.session_state.results_file = results_url
    st.rerun()
87
 
88
 
89
  def display_rtl(html):
 
259
 
260
  # File uploader
261
  uploaded_file = st.file_uploader(
262
+ "Upload evaluation results CSV",
263
+ type=["csv"],
264
+ on_change=on_file_upload,
265
+ key="uploaded_file",
266
  )
267
 
268
  if uploaded_file is not None:
269
+ st.session_state.results_file = uploaded_file
270
+
271
+ if st.session_state.results_file is None:
272
+ st.write("Or:")
273
+ if st.button("Choose from leaderboard"):
274
+ choose_input_file_from_leaderboard()
275
+
276
+ if st.session_state.results_file is not None:
277
+ uploaded_file = st.session_state.results_file
278
  # Load the data
279
  try:
280
  eval_results = pd.read_csv(uploaded_file)
 
352
  use_normalized = st.sidebar.toggle("Use normalized text", value=True)
353
 
354
  # Get the text columns based on the toggle
355
+ if use_normalized and "norm_reference_text" in eval_results.columns:
356
  ref_col, hyp_col = "norm_reference_text", "norm_predicted_text"
357
  else:
358
  ref_col, hyp_col = "reference_text", "predicted_text"
 
367
 
368
  # If no dataset column, try to infer from filename
369
  if uploaded_file is not None:
370
+ if isinstance(uploaded_file, str):
371
+ filename_stem = Path(uploaded_file).stem
372
+ else:
373
+ filename_stem = Path(uploaded_file.name).stem
374
  dataset_name = filename_stem
375
 
376
  if not dataset_name and "dataset" in eval_results.columns: