navidved committed on
Commit
47eb7ca
·
verified ·
1 Parent(s): 33d3ab3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +394 -417
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import json
4
  import pandas as pd
5
  from datasets import load_dataset, DatasetDict, Dataset, Audio
6
- from huggingface_hub import HfApi, whoami, login, hf_hub_download
7
  import tempfile
8
  import shutil
9
  import gc
@@ -17,39 +17,44 @@ import numpy as np
17
  from pydantic import BaseModel
18
  from typing import Optional, List, Tuple
19
  from datetime import datetime
20
- import requests # Added for trim_audio_action
21
 
22
  # Log in with Hugging Face token
23
  token = os.getenv("hf_token")
24
  if token:
25
- login(token)
 
 
 
 
 
26
  else:
27
- print("Warning: hf_token environment variable not set. Hugging Face Hub operations might fail.")
28
 
29
  # Configuration
30
  HF_DATASET_NAME = "navidved/channelb-raw-data"
31
- AUDIO_DIR = "audio" # Not actively used if paths are absolute or in dataset item
32
- SAVE_PATH = "annotations.json"
33
  ALLOWED_USERS = ["shahab7", "amirnamini23", "Mohsen711", "mahya2025", "najmeh00", "sepehr21ar", "zahraemarati", "moghim72", "amin76", "vargha", "navidved"]
34
  REVIEWERS = ["vargha", "navidved"]
35
  ANNOTATORS = [user for user in ALLOWED_USERS if user not in REVIEWERS]
36
  CURRENT_USERNAME = None
37
  PAGE_SIZE = 100
38
- SAVE_INTERVAL = 10
 
39
 
40
  # --- SECOND PHASE CONFIGURATION ---
41
- SECOND_PHASE = False # Set to True to activate second phase review
42
- SECOND_PHASE_REVIEW_MAPPING = {} # Populated if SECOND_PHASE is True. Maps: reviewer_username -> original_annotator_username
43
 
44
  # Global state variables
45
- current_page = 0 # Stores the USER-RELATIVE page index
46
- # ds_iter = None # No longer maintained globally for streaming robustness
47
- current_page_data = None # Pandas DataFrame for the current page's data
48
- audio_backup = {} # For undo_trim, if needed (simplified)
49
- annotation_count = 0
50
  unsaved_changes = {}
51
- total_samples = 0 # Total samples in the HF_DATASET_NAME
52
- annotator_ranges = {} # Stores {annotator_username: (start_abs_idx, end_abs_idx)}
53
 
54
  # Pydantic data models
55
  class AudioTrim(BaseModel):
@@ -69,7 +74,7 @@ class Annotation(BaseModel):
69
  update_at: datetime
70
 
71
  class Sample(BaseModel):
72
- id: int # Absolute index in the dataset
73
  voice_name: str
74
  original_subtitle: str
75
  ignore_it: bool = False
@@ -84,89 +89,119 @@ class DatasetModel(BaseModel):
84
  def load_saved_annotations():
85
  dataset_model = None
86
  local_file_loaded_successfully = False
 
87
 
88
- # Phase 1: Try to load from local SAVE_PATH
89
  if os.path.exists(SAVE_PATH):
90
  try:
91
  with open(SAVE_PATH, "r", encoding="utf-8") as f:
92
  data = json.load(f)
93
- # Perform a basic check if data seems like a DatasetModel structure
94
- if "samples" in data or not data: # Allow empty dict for initially empty model
95
- dataset_model = DatasetModel(**data)
96
- print(f"Loaded annotations from local JSON file: {SAVE_PATH}")
97
- local_file_loaded_successfully = True
98
- else:
99
- print(f"Local JSON file {SAVE_PATH} does not seem to have the correct structure. Will ignore.")
100
- dataset_model = None # Explicitly set to None
101
  except Exception as e:
102
- print(f"Error loading local JSON file '{SAVE_PATH}': {str(e)}. Corrupt file? Will try HF Hub if token available, or create new.")
103
- # Optionally, rename corrupt local file to prevent repeated load errors from it
104
  try:
105
  corrupt_path = SAVE_PATH + ".corrupt." + datetime.now().strftime("%Y%m%d%H%M%S%f")
106
  os.rename(SAVE_PATH, corrupt_path)
107
  print(f"Renamed corrupt local file to {corrupt_path}")
108
  except OSError as re_e:
109
  print(f"Could not rename corrupt local file: {re_e}")
110
- dataset_model = None # Ensure it's None if local load failed
111
 
112
- # Phase 2: If local load failed or file didn't exist, and token is available, try HF Hub
113
- # Only attempt HF download if local load was not successful.
114
  if not local_file_loaded_successfully and token:
115
- print("Local annotations not loaded or not found/corrupt. Trying Hugging Face Hub...")
116
  try:
117
  hf_path = hf_hub_download(
118
  repo_id=HF_DATASET_NAME,
119
- filename=os.path.basename(SAVE_PATH), # Use basename in case SAVE_PATH is a full path
120
  repo_type="dataset",
121
  token=token
122
  )
123
  with open(hf_path, "r", encoding="utf-8") as f:
124
  data = json.load(f)
125
- dataset_model = DatasetModel(**data)
126
- # Cache it locally after successful download
127
  with open(SAVE_PATH, "w", encoding="utf-8") as f_cache:
128
  f_cache.write(dataset_model.model_dump_json(exclude_none=True, indent=4))
129
- print(f"Loaded annotations from HF dataset repository '{HF_DATASET_NAME}/{os.path.basename(SAVE_PATH)}' and cached locally to '{SAVE_PATH}'.")
130
- except Exception as e: # Catches HfHubHTTPError (like 404) and other issues
131
- print(f"Error loading JSON file from HF repo '{HF_DATASET_NAME}/{os.path.basename(SAVE_PATH)}': {str(e)}")
132
- # If HF load fails, dataset_model remains as it was (None if local also failed/absent)
 
 
 
 
133
 
134
- # Phase 3: If still no dataset_model (neither local nor HF Hub worked), create a new empty one
135
  if dataset_model is None:
136
  print("No valid annotations found locally or on HF Hub (or failed to load). Creating new empty DatasetModel.")
137
  dataset_model = DatasetModel(samples=[])
138
-
139
  return dataset_model
140
 
141
-
142
- def save_annotations(dataset_model: DatasetModel):
143
- global annotation_count
144
- try:
145
- with open(SAVE_PATH, "w", encoding="utf-8") as f:
146
- f.write(dataset_model.model_dump_json(exclude_none=True, indent=4))
147
- print(f"Saved annotations to {SAVE_PATH}")
148
- annotation_count += 1
149
- if annotation_count % SAVE_INTERVAL == 0 and token:
150
- push_json_to_hf()
151
- except Exception as e:
152
- print(f"Error saving annotations: {str(e)}")
153
-
154
  def push_json_to_hf():
 
 
 
155
  if not token:
156
- print("Cannot push to HF: token not available.")
 
 
 
 
 
 
 
 
 
 
157
  return
 
158
  try:
159
  api = HfApi()
160
  api.upload_file(
161
- path_or_fileobj=SAVE_PATH,
162
- path_in_repo=os.path.basename(SAVE_PATH), # Use basename
163
  repo_type="dataset",
164
  repo_id=HF_DATASET_NAME,
165
- token=token
 
166
  )
167
- print(f"Uploaded {os.path.basename(SAVE_PATH)} to Hugging Face repository {HF_DATASET_NAME}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  except Exception as e:
169
- print(f"Error uploading JSON file: {str(e)}")
 
 
 
 
170
 
171
  def calculate_annotator_ranges(total_samples_val, annotators_list):
172
  num_annotators = len(annotators_list)
@@ -184,7 +219,7 @@ def calculate_annotator_ranges(total_samples_val, annotators_list):
184
  end_idx += 1
185
  if end_idx >= total_samples_val:
186
  end_idx = total_samples_val -1
187
- if start_idx <= end_idx: # Ensure valid range
188
  ranges[annotator] = (start_idx, end_idx)
189
  start_idx = end_idx + 1
190
  print(f"Calculated annotator ranges: {ranges}")
@@ -209,7 +244,7 @@ def initialize_second_phase_assignments():
209
  SECOND_PHASE_REVIEW_MAPPING[annotator] = annotator
210
  print(f"Second phase: {annotator} will review their own work.")
211
  else:
212
- for i, reviewer_user in enumerate(ANNOTATORS): # In 2nd phase, ANNOTATORS become reviewers of other ANNOTATORS
213
  original_annotator_idx = (i - 1 + len(ANNOTATORS)) % len(ANNOTATORS)
214
  original_annotator_user = ANNOTATORS[original_annotator_idx]
215
  SECOND_PHASE_REVIEW_MAPPING[reviewer_user] = original_annotator_user
@@ -220,24 +255,31 @@ def initialize_second_phase_assignments():
220
  print(f"Warning: Original annotator {original_annotator} (being reviewed by {reviewer}) has no range defined in annotator_ranges.")
221
 
222
  def get_user_allowed_range(username):
223
- global annotator_ranges, total_samples
224
  if SECOND_PHASE:
225
- if not SECOND_PHASE_REVIEW_MAPPING:
226
- initialize_second_phase_assignments()
 
 
 
227
 
228
  original_annotator_to_review = SECOND_PHASE_REVIEW_MAPPING.get(username)
229
  if original_annotator_to_review:
230
- if not annotator_ranges and total_samples > 0:
 
231
  annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
232
-
233
  user_range = annotator_ranges.get(original_annotator_to_review)
234
  return user_range
235
- else:
236
- return None
237
  else: # First Phase Logic
238
  if get_user_role(username) == "reviewer":
239
  return (0, total_samples - 1) if total_samples > 0 else None
240
- elif username in annotator_ranges:
 
 
 
 
241
  return annotator_ranges[username]
242
  else:
243
  return None
@@ -255,6 +297,7 @@ def get_dataset_info():
255
  if total_samples > 0:
256
  return {'num_samples': total_samples}
257
  try:
 
258
  ds_info_obj = load_dataset(HF_DATASET_NAME, split="train", streaming=False)
259
  num_samples_val = ds_info_obj.num_rows
260
  if num_samples_val and num_samples_val > 0:
@@ -262,11 +305,12 @@ def get_dataset_info():
262
  print(f"Dataset info: total_samples set to {total_samples}")
263
  return {'num_samples': total_samples}
264
  else:
265
- print("Warning: ds_info_obj.num_rows was not positive. Trying iteration for count (may be slow).")
266
  ds_stream = load_dataset(HF_DATASET_NAME, split="train", streaming=True)
267
  count = 0
268
- for _ in ds_stream:
269
  count +=1
 
270
  if count > 0:
271
  total_samples = count
272
  print(f"Dataset info: total_samples set to {total_samples} by iteration.")
@@ -276,19 +320,20 @@ def get_dataset_info():
276
  total_samples = -1
277
  return {'num_samples': -1}
278
  except Exception as e:
279
- print(f"Error getting dataset info: {e}")
280
  total_samples = -1
281
  return {'num_samples': -1}
282
 
283
- # Initial data load (moved after functions it calls are defined)
284
- dataset_info = get_dataset_info() # This sets global total_samples
285
- if total_samples > 0:
286
- annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
287
- if SECOND_PHASE:
288
- initialize_second_phase_assignments()
289
- else:
290
- print("Warning: total_samples is not positive. Annotation ranges and second phase assignments may be incorrect.")
291
- annotator_ranges = {}
 
292
 
293
  def get_audio_path(audio_entry):
294
  if isinstance(audio_entry, dict):
@@ -307,11 +352,9 @@ def get_audio_path(audio_entry):
307
  return audio_entry
308
  return None
309
 
310
- # --- MODIFIED FUNCTION ---
311
  def load_page_data(page_num_within_user_view=0):
312
  global current_page_data, current_page
313
 
314
- # Default to empty DataFrame; current_page updated to reflect attempt
315
  current_page_data = pd.DataFrame(columns=["audio", "sentence", "id_within_page", "absolute_idx"])
316
  current_page = page_num_within_user_view
317
 
@@ -326,24 +369,18 @@ def load_page_data(page_num_within_user_view=0):
326
  print(f"User {CURRENT_USERNAME} has an invalid allowed range: {user_allowed_range}")
327
  return current_page_data
328
 
329
- # Calculate the global start index for this page based on user's view
330
  page_global_start_idx = user_start_abs + (page_num_within_user_view * PAGE_SIZE)
331
 
332
  if page_global_start_idx > user_end_abs:
333
  print(f"Requested page {page_num_within_user_view} (abs start {page_global_start_idx}) is beyond user {CURRENT_USERNAME}'s allowed samples end ({user_end_abs}).")
334
- return current_page_data # Return empty DataFrame
335
 
336
- # Calculate the global end index for this page, capped by user's total assigned samples
337
  page_global_end_idx = min(page_global_start_idx + PAGE_SIZE - 1, user_end_abs)
338
-
339
- # Calculate how many samples are actually on this page
340
  num_samples_on_this_page = page_global_end_idx - page_global_start_idx + 1
341
 
342
  if num_samples_on_this_page <= 0:
343
- # This might happen if page_global_start_idx is valid but page_global_end_idx calculation results in non-positive samples
344
- # (e.g. page_global_start_idx is exactly user_end_abs + 1 after rounding from a large page_num_within_user_view)
345
  print(f"No samples for user {CURRENT_USERNAME} on their page {page_num_within_user_view}. Calculated range for page: [{page_global_start_idx}-{page_global_end_idx}]")
346
- return current_page_data # Return empty DataFrame
347
 
348
  print(f"Loading page {page_num_within_user_view} for user {CURRENT_USERNAME}. "
349
  f"Effective absolute dataset range for this page: [{page_global_start_idx}-{page_global_end_idx}] "
@@ -351,29 +388,21 @@ def load_page_data(page_num_within_user_view=0):
351
  f"Will attempt to load {num_samples_on_this_page} samples.")
352
 
353
  try:
354
- ds_full = load_dataset(HF_DATASET_NAME, split="train", streaming=True)
355
- # Efficiently skip to the start of the page
356
  ds_page_specific = ds_full.skip(page_global_start_idx)
357
- # Take only the samples needed for this page
358
  page_iterable = ds_page_specific.take(num_samples_on_this_page)
359
  except Exception as e:
360
  print(f"Error loading or processing dataset via skip/take for page data: {e}")
361
- # current_page_data is already empty from the top, current_page updated
362
  return current_page_data
363
 
364
  samples_on_page_list = []
365
- # The absolute index for items coming from page_iterable will start at page_global_start_idx
366
  current_processing_abs_idx = page_global_start_idx
367
 
368
- # Iterate through the samples for the current page
369
  for id_on_page_counter, sample_data_item in enumerate(page_iterable):
370
  sample_data_item['absolute_idx'] = current_processing_abs_idx
371
- sample_data_item['id_within_page'] = id_on_page_counter # 0-indexed for the page
372
  samples_on_page_list.append(sample_data_item)
373
-
374
  current_processing_abs_idx += 1
375
-
376
- # Safety break: if we've collected enough samples for the page
377
  if id_on_page_counter + 1 >= num_samples_on_this_page:
378
  break
379
 
@@ -383,17 +412,26 @@ def load_page_data(page_num_within_user_view=0):
383
  f"First abs_idx: {samples_on_page_list[0]['absolute_idx']}, "
384
  f"Last abs_idx: {samples_on_page_list[-1]['absolute_idx']}.")
385
  else:
386
- # This case might occur if .take() returns fewer items than expected (e.g., dataset is shorter than total_samples indicated)
387
  print(f"No samples were loaded for page {page_num_within_user_view} (user: {CURRENT_USERNAME}) "
388
- f"despite expecting {num_samples_on_this_page} from range [{page_global_start_idx}-{page_global_end_idx}]. "
389
- f"This could mean the source dataset is shorter than anticipated or an issue with streaming/take().")
390
- # current_page_data remains empty
391
 
392
  gc.collect()
393
  return current_page_data
394
- # --- END OF MODIFIED FUNCTION ---
395
 
396
- # Core functions
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_action, accepted_flag=False):
398
  global current_page_data, unsaved_changes
399
 
@@ -411,7 +449,7 @@ def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_
411
  audio_entry_original = actual_sample_info["audio"]
412
  voice_name = os.path.basename(str(get_audio_path(audio_entry_original) or f"sample_{absolute_idx}"))
413
 
414
- dataset_model = load_saved_annotations()
415
  sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
416
 
417
  if not sample:
@@ -462,7 +500,7 @@ def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_
462
  if absolute_idx in unsaved_changes:
463
  del unsaved_changes[absolute_idx]
464
 
465
- save_annotations(dataset_model)
466
  return f"✓ Saved annotation for sample {absolute_idx}"
467
 
468
  def handle_second_phase_action(page_idx, idx_on_page, action: str):
@@ -491,7 +529,7 @@ def handle_second_phase_action(page_idx, idx_on_page, action: str):
491
  print(f"Warning: No prior annotation by {original_annotator_to_review} for sample {absolute_idx}. Creating placeholder for review.")
492
  annotation_to_review = Annotation(
493
  annotator=original_annotator_to_review,
494
- annotated_subtitle=sample.original_subtitle,
495
  create_at=datetime.now(),
496
  update_at=datetime.now()
497
  )
@@ -505,8 +543,7 @@ def handle_second_phase_action(page_idx, idx_on_page, action: str):
505
 
506
  if action == "approved":
507
  sample.is_approved_in_second_phase = True
508
- # else:
509
- # sample.is_approved_in_second_phase = False
510
 
511
  save_annotations(dataset_model)
512
  return f"✓ Review ({action}) saved for sample {absolute_idx} (Original annotator: {original_annotator_to_review})"
@@ -515,6 +552,7 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
515
  global current_page_data, total_samples
516
 
517
  if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
 
518
  return None, "", f"Invalid index ({idx_on_page}) for current page data (len {len(current_page_data) if current_page_data is not None else 'None'}).", "unreviewed", "white", True, False, "", gr.update(visible=False)
519
 
520
  actual_sample_info = current_page_data.iloc[idx_on_page]
@@ -523,23 +561,28 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
523
  audio_entry_original = actual_sample_info["audio"]
524
  audio_val = get_audio_path(audio_entry_original)
525
 
526
- default_transcript = actual_sample_info["sentence"]
527
  transcript_to_display = default_transcript
528
 
529
  ui_reviewer_field = "unreviewed"
530
  ui_color = "white"
531
  ui_editable = True
532
- ui_is_accepted_flag = False # For 1st phase reviewer checkbox state
533
 
534
  status_prefix = ""
535
  user_allowed_range = get_user_allowed_range(current_user_displaying)
536
  if user_allowed_range:
537
  user_start_abs, user_end_abs = user_allowed_range
538
- current_sample_num_in_user_assignment = absolute_idx - user_start_abs + 1
539
- total_samples_for_user = user_end_abs - user_start_abs + 1
540
- status_prefix = f"Sample {current_sample_num_in_user_assignment} of {total_samples_for_user} for you (Abs Idx {absolute_idx})."
 
 
 
 
 
541
  else:
542
- status_prefix = f"Sample (Abs Idx {absolute_idx})."
543
 
544
  dataset_model = load_saved_annotations()
545
  sample_from_json = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
@@ -566,7 +609,6 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
566
 
567
  if annotation_under_review:
568
  transcript_to_display = annotation_under_review.annotated_subtitle or default_transcript
569
- # ui_is_accepted_flag refers to the 2nd phase review status by *this* reviewer
570
  ui_is_accepted_flag = (annotation_under_review.second_phase_review_status == "approved" and
571
  annotation_under_review.second_phase_reviewed_by == current_user_displaying)
572
 
@@ -577,7 +619,7 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
577
  ui_color = "gray"
578
  ui_reviewer_field += f" (Already reviewed by {annotation_under_review.second_phase_reviewed_by} as {annotation_under_review.second_phase_review_status})"
579
  else:
580
- ui_color = "yellow" # Pending this user's review
581
  else:
582
  transcript_to_display = default_transcript
583
  ui_reviewer_field += " (No submission by original annotator)"
@@ -590,7 +632,7 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
590
  transcript_to_display = accepted_first_phase_annotation.annotated_subtitle or default_transcript
591
  ui_reviewer_field = f"Accepted by: {accepted_first_phase_annotation.first_phase_reviewer_username}"
592
  ui_color = "green"
593
- ui_is_accepted_flag = True # Checkbox state reflects this acceptance
594
  ui_editable = (get_user_role(current_user_displaying) == "reviewer")
595
  else:
596
  user_specific_annotation = next((a for a in sample_from_json.annotations or [] if a.annotator == current_user_displaying), None)
@@ -599,25 +641,21 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
599
  ui_reviewer_field = f"Your draft (as {user_specific_annotation.annotator})"
600
  ui_color = "yellow"
601
  ui_editable = True
602
- # ui_is_accepted_flag remains False, as it's user's own draft, not yet "accepted" by a reviewer role
603
- else: # No accepted ann, no user-specific ann. Check for other unaccepted annotations.
604
  other_annotations = [a for a in sample_from_json.annotations or [] if not a.is_first_phase_accepted]
605
  if other_annotations:
606
  if get_user_role(current_user_displaying) == "reviewer":
607
- other_ann_to_show = other_annotations[0] # Show first other draft
608
  transcript_to_display = other_ann_to_show.annotated_subtitle or default_transcript
609
  ui_reviewer_field = f"Draft by: {other_ann_to_show.annotator}"
610
  ui_color = "blue"
611
  ui_editable = True
612
- # ui_is_accepted_flag remains False (reviewer is seeing other's draft)
613
- else: # Annotator sees original if not their work and not accepted
614
  transcript_to_display = default_transcript
615
  ui_reviewer_field = f"Labeled by: {other_annotations[0].annotator}"
616
  ui_color = "lightblue"
617
- ui_editable = False # Annotator cannot edit another annotator's unreviewed work
618
- # else: default_transcript, "unreviewed", "white", editable=True already set
619
 
620
- # Override color for unsaved changes visual cue in first phase
621
  if not SECOND_PHASE and absolute_idx in unsaved_changes:
622
  ui_color = "pink"
623
 
@@ -627,24 +665,24 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
627
  else:
628
  ui_status_message += " (Annotation Phase)"
629
 
630
- # Determine visibility of the "Accept" checkbox (only for reviewers in 1st phase)
631
  show_accept_checkbox = not SECOND_PHASE and get_user_role(current_user_displaying) == "reviewer"
632
 
633
  return audio_val, transcript_to_display, ui_status_message, ui_reviewer_field, ui_color, ui_editable, ui_is_accepted_flag, default_transcript, gr.update(visible=show_accept_checkbox)
634
 
635
  def load_interface_data(page_idx_user_relative, idx_on_page):
636
- audio, text, base_status, saved_reviewer_text, color, editable, accepted_flag, original_dataset_text, accept_cb_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
 
637
 
638
  return (
639
- page_idx_user_relative,
640
- idx_on_page,
641
- audio,
642
- gr.update(value=text, interactive=editable),
643
- gr.update(value=saved_reviewer_text, elem_classes=[color]),
644
- base_status,
645
- original_dataset_text,
646
- accept_cb_update, # For the first_phase_accept_cb visibility
647
- accepted_flag # For the first_phase_accept_cb value state
648
  )
649
 
650
  def navigate_sample(page_idx_user_relative, idx_on_page, direction: int):
@@ -653,9 +691,10 @@ def navigate_sample(page_idx_user_relative, idx_on_page, direction: int):
653
  if current_page_data is None or len(current_page_data) == 0:
654
  user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
655
  err_msg = "No data loaded. Try reloading or check your assigned range."
656
- if not user_allowed_range or user_allowed_range[0] > user_allowed_range[1]:
657
  err_msg = "You have no samples assigned or your range is invalid."
658
 
 
659
  return page_idx_user_relative, idx_on_page, None, gr.update(value="Error", interactive=False), gr.update(value="Error"), err_msg, "", gr.update(visible=False), False
660
 
661
 
@@ -664,45 +703,50 @@ def navigate_sample(page_idx_user_relative, idx_on_page, direction: int):
664
  new_idx_on_page = target_idx_on_page
665
 
666
  user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
 
667
  if not user_allowed_range:
668
- audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
669
- return page_idx_user_relative, idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), "Error: No allowed range for navigation.", orig_text, cb_vis_update, acc_flag
 
 
670
 
671
 
672
- if target_idx_on_page < 0:
673
  if page_idx_user_relative > 0:
674
  new_page_idx_user_relative = page_idx_user_relative - 1
675
  temp_data = load_page_data(new_page_idx_user_relative)
676
  if temp_data is not None and not temp_data.empty:
677
  new_idx_on_page = len(temp_data) - 1
678
- else:
679
- audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
680
- status = status + " [Already at the first sample of this page/range]"
681
- return page_idx_user_relative, idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status, orig_text, cb_vis_update, acc_flag
682
- else:
683
- audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
684
- status = status + " [At the beginning of your assigned samples]"
685
- return page_idx_user_relative, idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status, orig_text, cb_vis_update, acc_flag
686
-
687
- elif target_idx_on_page >= len(current_page_data):
688
  new_page_idx_user_relative = page_idx_user_relative + 1
689
  temp_data = load_page_data(new_page_idx_user_relative)
690
  if temp_data is not None and not temp_data.empty:
691
  new_idx_on_page = 0
692
- else:
693
  current_abs_idx_check = -1
694
  if current_page_data is not None and not current_page_data.empty and idx_on_page < len(current_page_data):
695
  current_abs_idx_check = current_page_data.iloc[idx_on_page]['absolute_idx']
696
 
697
  is_at_very_end = user_allowed_range and current_abs_idx_check != -1 and current_abs_idx_check >= user_allowed_range[1]
698
 
699
- audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
 
700
  if is_at_very_end:
701
- status = status + " [At the end of your assigned samples]"
702
  else:
703
- status = status + " [No more samples in this direction (next page empty or end of assignment)]"
704
- return page_idx_user_relative, idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status, orig_text, cb_vis_update, acc_flag
705
 
 
706
  return load_interface_data(new_page_idx_user_relative, new_idx_on_page)
707
 
708
  def go_next_sample_wrapper(page_idx_user_relative, idx_on_page):
@@ -727,6 +771,11 @@ def review_and_next_sample_second_phase(page_idx_user_relative, idx_on_page, rev
727
 
728
  def jump_to_absolute_idx(target_abs_idx_str, current_page_idx_user_relative, current_idx_on_page):
729
  global current_page_data
 
 
 
 
 
730
 
731
  try:
732
  target_abs_idx = int(target_abs_idx_str)
@@ -734,63 +783,53 @@ def jump_to_absolute_idx(target_abs_idx_str, current_page_idx_user_relative, cur
734
 
735
  user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
736
  if not user_allowed_range or not is_within_range(target_abs_idx, user_allowed_range):
737
- status_msg = f"Target index {target_abs_idx} is outside your assigned range {user_allowed_range or 'N/A'}."
738
- audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
739
- return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt, cb_vis_update, acc
740
 
741
  user_start_abs, _ = user_allowed_range
742
  offset_from_user_start = target_abs_idx - user_start_abs
743
 
744
  if offset_from_user_start < 0:
745
- status_msg = f"Logic Error: Target index {target_abs_idx} has negative offset from user start {user_start_abs}."
746
- print(status_msg)
747
- audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
748
- return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt, cb_vis_update, acc
749
 
750
  new_user_relative_page_idx = offset_from_user_start // PAGE_SIZE
 
751
  temp_page_data_df = load_page_data(new_user_relative_page_idx)
752
 
753
  if temp_page_data_df is None or temp_page_data_df.empty:
754
- status_msg = f"No data found for your page {new_user_relative_page_idx} (containing abs index {target_abs_idx})."
755
- audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
756
- return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt, cb_vis_update, acc
757
-
758
- # current_page_data is now updated by load_page_data
759
- matching_rows = current_page_data[current_page_data['absolute_idx'] == target_abs_idx]
760
- if not matching_rows.empty:
761
- new_idx_on_page_actual = matching_rows.iloc[0]['id_within_page']
762
- else:
763
- # This can happen if target_abs_idx is within the range of the loaded page, but not exactly matching an item
764
- # (e.g., if the dataset is shorter than expected and the page didn't fill up to target_abs_idx)
765
- # Or, if target_abs_idx is valid but not the first item on the page.
766
- # The id_within_page calculation from offset_from_user_start % PAGE_SIZE might be more direct:
767
- new_idx_on_page_actual = offset_from_user_start % PAGE_SIZE
768
- if new_idx_on_page_actual >= len(current_page_data): # If calculated index is out of bounds for the loaded page
769
- status_msg = f"Index {target_abs_idx} on your page {new_user_relative_page_idx} is out of bounds for loaded data. Displaying start of page."
770
- print(status_msg)
771
- new_idx_on_page_actual = 0
772
- if current_page_data.empty: # Should have been caught by temp_page_data_df check
773
- audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
774
- return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg + " (Page empty after load)", orig_txt, cb_vis_update, acc
775
 
776
  return load_interface_data(new_user_relative_page_idx, new_idx_on_page_actual)
777
 
778
  except ValueError:
779
- status_msg = "Invalid index format for jump."
780
  except Exception as e:
781
  import traceback
782
- status_msg = f"Error jumping to index: {str(e)}"
783
- print(f"{status_msg}\n{traceback.format_exc()}")
784
 
785
- # Fallback for errors
786
- audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
787
- return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt, cb_vis_update, acc
788
 
789
  def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_end_str):
790
- # Helper to format returns, ensuring all outputs for navigation_outputs_extended are present
791
  def _return_current_state_with_message(msg_suffix):
792
- page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val, orig_text_val, cb_vis_update, cb_val = load_interface_data(page_idx_user_relative, idx_on_page)
793
- return page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val + f" [{msg_suffix}]", orig_text_val, cb_vis_update, cb_val
794
 
795
  if SECOND_PHASE: return _return_current_state_with_message("Trimming disabled in Review Phase.")
796
 
@@ -799,36 +838,27 @@ def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_
799
 
800
  actual_sample_info = current_page_data.iloc[idx_on_page]
801
  absolute_idx = actual_sample_info['absolute_idx']
802
-
803
  original_audio_path_info = get_audio_path(actual_sample_info["audio"])
804
  source_basename_for_trimmed_file = os.path.basename(str(original_audio_path_info)) if isinstance(original_audio_path_info, str) else f"sample_raw_data_{absolute_idx}"
805
-
806
  audio_seg = None
807
  temp_dir_for_download = None
808
 
809
  try:
810
- if isinstance(original_audio_path_info, tuple): # Raw data (sr, array)
811
  sr, audio_array = original_audio_path_info
812
  if not isinstance(audio_array, np.ndarray): return _return_current_state_with_message("Raw audio data is not a numpy array.")
813
  if audio_array.size == 0: return _return_current_state_with_message("Cannot trim empty audio array.")
814
-
815
- # Ensure contiguous array and correct channel interpretation (assuming mono or stereo)
816
  audio_array = np.ascontiguousarray(audio_array)
817
  channels = 1 if audio_array.ndim == 1 else (audio_array.shape[1] if audio_array.ndim == 2 and audio_array.shape[1] in [1,2] else (audio_array.shape[0] if audio_array.ndim == 2 and audio_array.shape[0] in [1,2] else 0))
818
  if channels == 0: return _return_current_state_with_message(f"Unsupported audio array shape or channels: {audio_array.shape}")
819
- if audio_array.ndim == 2 and audio_array.shape[0] < audio_array.shape[1] and audio_array.shape[0] in [1, 2]: # if channels are rows
820
- audio_array = np.ascontiguousarray(audio_array.T) # Transpose to (samples, channels)
821
-
822
- # Convert to int16 for AudioSegment
823
  if audio_array.dtype == np.float32 or audio_array.dtype == np.float64: audio_array_int = (audio_array * np.iinfo(np.int16).max).astype(np.int16)
824
  elif audio_array.dtype == np.int16: audio_array_int = audio_array
825
- elif audio_array.dtype == np.int32: audio_array_int = (audio_array >> 16).astype(np.int16) # Approximate if int32
826
  else: return _return_current_state_with_message(f"Unsupported numpy array dtype for raw audio: {audio_array.dtype}")
827
-
828
  sample_width = audio_array_int.itemsize
829
  audio_seg = AudioSegment(data=audio_array_int.tobytes(), sample_width=sample_width, frame_rate=sr, channels=channels)
830
-
831
- elif isinstance(original_audio_path_info, str): # Path string or URL
832
  audio_to_load = original_audio_path_info
833
  if not (os.path.exists(audio_to_load) or audio_to_load.startswith("http")): return _return_current_state_with_message("Audio file path is invalid, does not exist, or is not a valid URL.")
834
  if audio_to_load.startswith("http"):
@@ -841,48 +871,44 @@ def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_
841
  audio_seg = AudioSegment.from_file(audio_to_load)
842
  else:
843
  return _return_current_state_with_message("Trimming not supported for this audio source.")
844
-
845
  if audio_seg is None: return _return_current_state_with_message("Failed to load audio segment.")
846
-
847
  try: start_s, end_s = float(trim_start_str), float(trim_end_str)
848
  except ValueError: return _return_current_state_with_message("Invalid trim times: Start and End must be numbers.")
849
  start_ms, end_ms, audio_duration_ms = int(start_s * 1000), int(end_s * 1000), len(audio_seg)
850
  if not (0 <= start_ms < end_ms and end_ms <= audio_duration_ms):
851
- return _return_current_state_with_message(f"Invalid trim times: start={start_s}s, end={end_s}s for audio of {audio_duration_ms/1000.0:.2f}s. Ensure 0 <= start < end <= duration.")
852
-
853
  trimmed_seg = audio_seg[start_ms:end_ms]
854
  os.makedirs("trimmed_audio", exist_ok=True)
855
  safe_voice_name = re.sub(r'[^\w.-]', '_', source_basename_for_trimmed_file)
856
  trimmed_filename = f"trimmed_{absolute_idx}_{safe_voice_name}"
857
- if not os.path.splitext(trimmed_filename)[1]: trimmed_filename += ".wav" # Default to wav if no extension
858
  trimmed_path = os.path.join("trimmed_audio", trimmed_filename)
859
  export_format = os.path.splitext(trimmed_path)[1][1:].lower() or "wav"
860
  trimmed_seg.export(trimmed_path, format=export_format)
861
-
862
  dataset_model = load_saved_annotations()
863
  sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
864
-
865
  if not sample:
866
  sample = Sample(id=absolute_idx, voice_name=os.path.basename(str(get_audio_path(actual_sample_info["audio"]) or f"sample_{absolute_idx}")),
867
  original_subtitle=actual_sample_info["sentence"], annotations=[])
868
  dataset_model.samples = dataset_model.samples or []
869
  dataset_model.samples.append(sample)
870
-
871
  now = datetime.now()
872
  annotation = next((a for a in sample.annotations or [] if a.annotator == CURRENT_USERNAME), None)
873
  if not annotation:
874
  annotation = Annotation(annotator=CURRENT_USERNAME, create_at=now, update_at=now)
875
  sample.annotations = sample.annotations or []
876
  sample.annotations.append(annotation)
877
-
878
  annotation.audio_trims = [AudioTrim(start=start_s, end=end_s)]
879
  annotation.update_at = now
880
  save_annotations(dataset_model)
881
 
882
  # Return full state, but with new audio path and status message
883
- page_idx, current_idx, _, transcript_update, reviewer_update, status_val, orig_text_val, cb_vis_update, cb_val = load_interface_data(page_idx_user_relative, idx_on_page)
884
- return page_idx, current_idx, trimmed_path, transcript_update, reviewer_update, status_val + " [Trimmed]", orig_text_val, cb_vis_update, cb_val
885
-
 
 
 
886
  except Exception as e:
887
  import traceback
888
  print(f"Error during trim_audio_action for abs_idx {absolute_idx}: {str(e)}\n{traceback.format_exc()}")
@@ -893,11 +919,11 @@ def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_
893
 
894
  def undo_trim_action(page_idx_user_relative, idx_on_page):
895
  def _return_current_state_with_message(msg_suffix):
896
- page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val, orig_text_val, cb_vis_update, cb_val = load_interface_data(page_idx_user_relative, idx_on_page)
897
- return page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val + f" [{msg_suffix}]", orig_text_val, cb_vis_update, cb_val
 
898
 
899
  if SECOND_PHASE: return _return_current_state_with_message("Undo Trim disabled in Review Phase.")
900
-
901
  if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
902
  return _return_current_state_with_message("Audio data not available (page error).")
903
 
@@ -910,42 +936,27 @@ def undo_trim_action(page_idx_user_relative, idx_on_page):
910
  annotation.audio_trims = None
911
  annotation.update_at = datetime.now()
912
  save_annotations(dataset_model)
913
-
914
- # Reload interface to show original audio. get_sample will handle finding the original path.
915
- return _return_current_state_with_message("Trim undone")
916
-
917
 
918
  def confirm_delete_audio_action(page_idx_user_relative, idx_on_page):
919
- def _return_current_state_with_message(msg_suffix): # Not really used here as load_interface_data is called directly
920
- page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val, orig_text_val, cb_vis_update, cb_val = load_interface_data(page_idx_user_relative, idx_on_page)
921
- return page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val + f" [{msg_suffix}]", orig_text_val, cb_vis_update, cb_val
922
-
923
- if SECOND_PHASE:
924
- # Instead of custom message, just call load_interface_data which handles phase display
925
  loaded_data = load_interface_data(page_idx_user_relative, idx_on_page)
926
- return (*loaded_data[0:5], loaded_data[5] + " [Delete disabled in Review Phase]", *loaded_data[6:])
927
-
928
 
 
 
929
  if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
930
- loaded_data = load_interface_data(page_idx_user_relative, idx_on_page) # Will show error from get_sample
931
- return (*loaded_data[0:5], loaded_data[5] + " [Audio data not available (page error)]", *loaded_data[6:])
932
-
933
 
934
  absolute_idx = current_page_data.iloc[idx_on_page]['absolute_idx']
935
  voice_name_original = os.path.basename(str(get_audio_path(current_page_data.iloc[idx_on_page]["audio"]) or f"sample_{absolute_idx}"))
936
-
937
  dataset_model = load_saved_annotations()
938
  sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
939
  if not sample:
940
- sample = Sample(
941
- id=absolute_idx,
942
- voice_name=voice_name_original,
943
- original_subtitle=current_page_data.iloc[idx_on_page]["sentence"],
944
- annotations=[]
945
- )
946
  dataset_model.samples = dataset_model.samples or []
947
  dataset_model.samples.append(sample)
948
-
949
  sample.ignore_it = True
950
  now = datetime.now()
951
  deleted_text_marker = "AUDIO DELETED (This audio has been removed.)"
@@ -959,12 +970,8 @@ def confirm_delete_audio_action(page_idx_user_relative, idx_on_page):
959
  sample.annotations = sample.annotations or []
960
  sample.annotations.append(annotation)
961
  save_annotations(dataset_model)
 
962
 
963
- # After deleting, reload the interface for this item. get_sample will handle the "deleted" display.
964
- # load_interface_data returns a 9-tuple
965
- return load_interface_data(page_idx_user_relative, idx_on_page)
966
-
967
- # Export functions
968
  def sanitize_string(s):
969
  if not isinstance(s, str): s = str(s)
970
  return re.sub(r'[^\w-./]', '_', s)
@@ -979,7 +986,7 @@ def push_to_hub_with_retry(dataset_dict, repo_id, private=True, token_val=None):
979
  print("Cannot push to hub: No token provided for push_to_hub_with_retry.")
980
  return
981
  print(f"Pushing dataset to {repo_id}")
982
- dataset_dict.push_to_hub(repo_id, private=private, token=token_val)
983
 
984
  def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progress()):
985
  if not hf_token_for_export:
@@ -993,27 +1000,18 @@ def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progre
993
 
994
  dataset_model_annotations = load_saved_annotations()
995
 
996
- # Use global total_samples, try to fetch if not set
997
  current_total_samples = total_samples
998
  if current_total_samples <= 0:
999
- info = get_dataset_info() # This updates global total_samples
1000
  current_total_samples = total_samples
1001
  if current_total_samples <= 0:
1002
  return "Export failed: Total number of samples is unknown or invalid."
1003
 
1004
- ds_source = load_dataset(HF_DATASET_NAME, split="train", streaming=False)
1005
 
1006
- # Verify if ds_source.num_rows matches current_total_samples
1007
- if hasattr(ds_source, 'num_rows') and ds_source.num_rows != current_total_samples:
1008
- print(f"Warning: Source dataset num_rows ({ds_source.num_rows}) mismatches cached total_samples ({current_total_samples}). Using source num_rows for iteration count if smaller.")
1009
- # Decide which count to trust or how to reconcile. For now, iterate up to the smaller of the two.
1010
- # iteration_limit = min(ds_source.num_rows, current_total_samples) if ds_source.num_rows else current_total_samples
1011
- # For safety, iterate through ds_source and use its length
1012
- iteration_limit = len(ds_source) # For non-streaming, len() is reliable.
1013
- if iteration_limit != current_total_samples:
1014
- print(f"Adjusting iteration limit for export to {iteration_limit} based on loaded source dataset length.")
1015
- else:
1016
- iteration_limit = current_total_samples
1017
 
1018
 
1019
  exported_data_list = []
@@ -1023,9 +1021,7 @@ def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progre
1023
  for i, source_sample in enumerate(ds_source):
1024
  if i >= iteration_limit: break
1025
  num_processed_from_source +=1
1026
-
1027
  absolute_idx = i
1028
-
1029
  audio_entry = source_sample.get("audio")
1030
  sentence_val = source_sample.get("sentence", "")
1031
  audio_dict_to_export = audio_entry
@@ -1042,201 +1038,178 @@ def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progre
1042
  approved_anns = [a for a in annotation_data.annotations if a.second_phase_review_status == "approved"]
1043
  if SECOND_PHASE and approved_anns:
1044
  best_ann = sorted(approved_anns, key=lambda x: x.second_phase_review_timestamp or datetime.min, reverse=True)[0]
1045
-
1046
  if not best_ann:
1047
  accepted_anns = [a for a in annotation_data.annotations if a.is_first_phase_accepted]
1048
  best_ann = sorted(accepted_anns, key=lambda x: x.update_at, reverse=True)[0] if accepted_anns else None
1049
-
1050
  if not best_ann:
1051
  best_ann = sorted(annotation_data.annotations, key=lambda x: x.update_at, reverse=True)[0]
1052
 
1053
  if best_ann:
1054
  sentence_val = best_ann.annotated_subtitle if best_ann.annotated_subtitle is not None else sentence_val
1055
-
1056
- if best_ann.audio_trims and audio_dict_to_export and isinstance(audio_dict_to_export, dict) and audio_dict_to_export.get("array") is not None:
1057
- # Attempt to load and use trimmed audio if path exists
1058
- original_audio_path_for_trim_lookup = get_audio_path(audio_entry) # Get original path/info
1059
  original_voice_name_for_trim = os.path.basename(str(original_audio_path_for_trim_lookup or f"sample_{absolute_idx}"))
1060
  safe_voice_name_for_trim = re.sub(r'[^\w.-]', '_', original_voice_name_for_trim)
1061
  trimmed_fname_base = f"trimmed_{absolute_idx}_{safe_voice_name_for_trim}"
1062
  potential_trimmed_path = os.path.join("trimmed_audio", trimmed_fname_base + ".wav")
1063
-
1064
  if os.path.exists(potential_trimmed_path):
1065
  try:
1066
- arr, sr = sf.read(potential_trimmed_path)
1067
- audio_dict_to_export = {"array": arr, "sampling_rate": sr}
1068
  except Exception as e_read_trim:
1069
- print(f"Warning: Could not read trimmed audio file {potential_trimmed_path} for sample {absolute_idx}: {e_read_trim}. Exporting original/untrimmed.")
1070
- else:
1071
- print(f"Warning: Trimmed audio file {potential_trimmed_path} not found for sample {absolute_idx}. Exporting original/untrimmed.")
1072
 
1073
  exported_data_list.append({
1074
  "audio": audio_dict_to_export,
1075
  "sentence": sanitize_sentence(sentence_val)
1076
  })
1077
-
1078
  if (i + 1) % 100 == 0:
1079
  progress((i + 1) / iteration_limit, f"Processed {i+1}/{iteration_limit} samples")
1080
  gc.collect()
1081
 
1082
- if num_processed_from_source != iteration_limit:
1083
- print(f"Warning: Processed {num_processed_from_source} from source, but iteration_limit was {iteration_limit}.")
1084
-
1085
- if not exported_data_list:
1086
- return "No data to export after processing."
1087
 
1088
- for item in exported_data_list:
1089
- if item["audio"] is None:
1090
- item["audio"] = {"array": np.array([], dtype=np.float32), "sampling_rate": 16000}
1091
- elif isinstance(item["audio"], dict) and 'path' in item["audio"] and item["audio"]['path'] is None:
1092
- item["audio"] = {"array": np.array([], dtype=np.float32), "sampling_rate": 16000}
1093
 
1094
  try:
1095
  final_dataset = Dataset.from_list(exported_data_list)
 
1096
  final_dataset = final_dataset.cast_column("audio", Audio(sampling_rate=16000))
1097
  except Exception as e_cast:
1098
  print(f"Error during Dataset.from_list or cast_column: {e_cast}")
1099
  for idx_problem, problematic_item in enumerate(exported_data_list[:5]):
1100
- print(f"Sample item {idx_problem} for export: Audio type {type(problematic_item['audio'])}, Audio content: {str(problematic_item['audio'])[:200]}")
1101
- return f"Export failed during data conversion: {e_cast}. Check audio data formats."
1102
 
1103
  dataset_dict_export = DatasetDict({"train": final_dataset})
1104
-
1105
  progress(0.95, "Uploading to Hugging Face...")
1106
- # Ensure repo_name_str is just the dataset name part for constructing target_repo_id
1107
- dataset_name_part = repo_name_str.split('/')[-1]
1108
- target_repo_id = f"{whoami(token=hf_token_for_export)['name']}/{dataset_name_part}"
1109
 
1110
- push_to_hub_with_retry(
1111
- dataset_dict=dataset_dict_export,
1112
- repo_id=target_repo_id,
1113
- private=True,
1114
- token_val=hf_token_for_export
1115
- )
 
 
 
1116
  end_time = time.time()
1117
  print(f"Upload done, total time: {end_time - start_time:.2f}s")
1118
  progress(1.0, "Upload complete!")
1119
  return f"Exported to huggingface.co/datasets/{target_repo_id}"
1120
-
1121
  except Exception as e:
1122
  import traceback
1123
  error_msg = f"Export failed: {str(e)}"
1124
  print(f"{error_msg}\n{traceback.format_exc()}")
1125
  return error_msg
1126
 
1127
- def hf_login(hf_token_val):
1128
- global CURRENT_USERNAME, token, current_page_data, total_samples, annotator_ranges, SECOND_PHASE_REVIEW_MAPPING
1129
 
 
 
 
 
1130
  failed_login_transcript_update = gr.update(value="", interactive=False)
1131
- # Define the full tuple structure for failure returns to ensure consistency
1132
- # page_idx, idx_on_page, audio, transcript_tb(interactive), reviewer_tb, status, original_transcript, cb_vis, cb_val
1133
- # This corresponds to navigation_outputs_extended (9 items)
1134
- # Login returns 19 items, so we need to map this carefully.
1135
 
1136
- # Outputs for login_button (19 outputs)
1137
- # login_container, main_container, reviewer_tb (val), hf_token_state, login_message,
1138
- # save_next_button(vis), transcript_tb(update), trim_button(vis), undo_trim_button(vis), delete_button(vis),
1139
- # first_phase_accept_cb(update with vis & val),
1140
- # approve_button(vis), reject_button(vis),
1141
- # current_page_idx_state(val), current_idx_on_page_state(val), audio_player(val),
1142
- # transcript_tb(another update, often same as above for value/interactive),
1143
- # status_md(val), original_transcript_state(val)
1144
-
1145
  def _failed_login_outputs(login_msg_text, reviewer_text_val="N/A"):
 
1146
  return (
1147
  gr.update(visible=True), gr.update(visible=False), # login_container, main_container
1148
- gr.update(value=reviewer_text_val), hf_token_val, login_msg_text, # reviewer_tb, hf_token_state, login_message
1149
- gr.update(visible=False), # save_next_button
1150
- failed_login_transcript_update, # transcript_tb (interactive part)
1151
- gr.update(visible=False), # trim_button
1152
- gr.update(visible=False), # undo_trim_button
1153
- gr.update(visible=False), # delete_button
1154
  gr.update(visible=False, value=False), # first_phase_accept_cb (vis & val)
1155
- gr.update(visible=False), # approve_button
1156
- gr.update(visible=False), # reject_button
1157
- 0, 0, None, # page_idx_state, idx_on_page_state, audio_player
1158
- failed_login_transcript_update, # transcript_tb (value part)
1159
  login_msg_text if "failed" in login_msg_text.lower() or "error" in login_msg_text.lower() else "Please log in.", # status_md
1160
  "" # original_transcript_state
1161
  )
1162
 
1163
- if not hf_token_val:
1164
  return _failed_login_outputs("Login failed: Token cannot be empty.")
1165
 
1166
  try:
1167
- user_info = whoami(token=hf_token_val)
 
1168
  username = user_info['name']
 
1169
 
1170
  if username in ALLOWED_USERS:
1171
  CURRENT_USERNAME = username
1172
- token = hf_token_val # Update global token if login is successful
 
1173
 
1174
- # Crucial: Re-fetch dataset info and ranges AFTER successful login,
1175
- # as these might depend on user context or be freshly needed.
1176
- ds_info = get_dataset_info() # This updates global total_samples
 
1177
  if total_samples <= 0:
1178
- return _failed_login_outputs(f"Login OK for {CURRENT_USERNAME}, but failed to get dataset size. Cannot proceed.", reviewer_text_val="Error")
1179
 
1180
- # Recalculate ranges and assignments based on potentially new total_samples
1181
  annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
1182
  if SECOND_PHASE:
1183
- # Re-initialize second phase assignments as total_samples or ANNOTATORS might have been reset/updated
1184
- SECOND_PHASE_REVIEW_MAPPING.clear() # Clear previous mapping before re-initializing
1185
- initialize_second_phase_assignments()
1186
 
1187
  user_allowed_range_check = get_user_allowed_range(CURRENT_USERNAME)
1188
  if not user_allowed_range_check or user_allowed_range_check[0] > user_allowed_range_check[1]:
1189
- return _failed_login_outputs(f"Login OK for {CURRENT_USERNAME}, but no samples assigned for {'review' if SECOND_PHASE else 'annotation'}.", reviewer_text_val="No Samples")
1190
 
1191
- # Load page 0 data for the logged-in user
1192
- current_page_data = load_page_data(0) # This sets global current_page_data and current_page
1193
 
1194
- is_second_phase_active = SECOND_PHASE
 
 
 
 
 
1195
 
1196
- # Load interface data for the first sample (page 0, index 0)
1197
- # load_interface_data returns a 9-tuple:
1198
- # (page_idx, idx_on_page, audio, transcript_update, reviewer_update, status, orig_text, cb_vis_update, cb_val)
1199
- initial_load_tuple = load_interface_data(current_page, 0 if not current_page_data.empty else 0)
1200
-
1201
 
1202
  # Structure for login_outputs (19 items)
1203
  return (
1204
- gr.update(visible=False), # login_container
1205
- gr.update(visible=True), # main_container
1206
- initial_load_tuple[4], # reviewer_tb (gr.update obj)
1207
- hf_token_val, # hf_token_state (value)
1208
- f"Login successful! Welcome {CURRENT_USERNAME}. Phase: {'Review' if is_second_phase_active else 'Annotation'}.", # login_message
1209
-
1210
- gr.update(visible=not is_second_phase_active), # save_next_button (visibility)
1211
- initial_load_tuple[3], # transcript_tb (gr.update obj for value and interactivity)
1212
- gr.update(visible=not is_second_phase_active), # trim_button (visibility)
1213
- gr.update(visible=not is_second_phase_active), # undo_trim_button (visibility)
1214
- gr.update(visible=not is_second_phase_active), # delete_button (visibility)
1215
-
1216
- gr.update(visible=initial_load_tuple[7]['visible'], value=initial_load_tuple[8]), # first_phase_accept_cb (gr.update obj for vis and val)
1217
-
1218
- gr.update(visible=is_second_phase_active), # approve_button (visibility)
1219
- gr.update(visible=is_second_phase_active), # reject_button (visibility)
1220
-
1221
- initial_load_tuple[0], # current_page_idx_state (value)
1222
- initial_load_tuple[1], # current_idx_on_page_state (value)
1223
- initial_load_tuple[2], # audio_player (value or gr.update obj)
1224
- initial_load_tuple[3], # transcript_tb (duplicate for Gradio's multiple output handling if needed, maps to same component)
1225
- initial_load_tuple[5], # status_md (value)
1226
- initial_load_tuple[6] # original_transcript_state (value)
1227
  )
1228
- else: # User not in ALLOWED_USERS
1229
  CURRENT_USERNAME = None
1230
- return _failed_login_outputs("User not authorized!", reviewer_text_val="Unauthorized")
 
1231
  except Exception as e:
1232
  CURRENT_USERNAME = None
 
1233
  import traceback
1234
  login_err_msg = f"Login failed: {str(e)}"
1235
  print(f"{login_err_msg}\n{traceback.format_exc()}")
1236
  return _failed_login_outputs(login_err_msg, reviewer_text_val="Login Error")
1237
 
1238
 
1239
- # Gradio Interface
1240
  css = """
1241
  .white { background-color: white; color: black; } .yellow { background-color: yellow; color: black; }
1242
  .blue { background-color: lightblue; color: black; } .green { background-color: lightgreen; color: black; }
@@ -1246,14 +1219,19 @@ css = """
1246
  .reviewer-textbox input { text-align: center; font-weight: bold; }
1247
  """
1248
  with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1249
- hf_token_state = gr.State(token) # Store token if passed via env
 
 
 
 
1250
  current_page_idx_state = gr.State(0)
1251
  current_idx_on_page_state = gr.State(0)
1252
  original_transcript_state = gr.State("")
1253
 
1254
  with gr.Column(visible=True, elem_id="login_container") as login_container:
1255
  gr.Markdown("## HF Authentication")
1256
- hf_token_input = gr.Textbox(label="Hugging Face Token", type="password", value=token or "") # Pre-fill if env var set
 
1257
  login_button = gr.Button("Login")
1258
  login_message = gr.Markdown("")
1259
 
@@ -1291,30 +1269,33 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1291
  jump_text_tb = gr.Textbox(label="Jump to Global Index", placeholder="Enter dataset absolute index")
1292
  jump_button = gr.Button("Jump")
1293
  with gr.Row():
1294
- default_repo_name = f"your-username/asr-dataset" # Will be updated on login if user context known
1295
- hf_repo_name_tb = gr.Textbox(label="Export Repository Name (username/dataset-name)", value=default_repo_name)
 
1296
  hf_export_button = gr.Button("Export to Hugging Face", variant="primary")
1297
  hf_export_status_md = gr.Markdown("")
1298
 
1299
  # Outputs for login_button (19 outputs)
1300
  login_outputs = [
1301
- login_container, main_container, reviewer_tb, hf_token_state, login_message,
1302
- save_next_button, transcript_tb, trim_button, undo_trim_button, delete_button,
1303
- first_phase_accept_cb,
1304
- approve_button, reject_button,
1305
- current_page_idx_state, current_idx_on_page_state, audio_player, transcript_tb,
1306
- status_md, original_transcript_state
 
1307
  ]
1308
  login_button.click(fn=hf_login, inputs=[hf_token_input], outputs=login_outputs)
1309
 
1310
- # Common outputs for navigation and actions that reload sample view (9 outputs)
1311
- # page_idx_state, idx_on_page_state, audio_player, transcript_tb (update), reviewer_tb (update),
1312
- # status_md, original_transcript_state, first_phase_accept_cb (update for vis), first_phase_accept_cb (update for val)
 
1313
  navigation_outputs_extended = [
1314
- current_page_idx_state, current_idx_on_page_state,
1315
- audio_player, transcript_tb, reviewer_tb, status_md, original_transcript_state,
1316
- first_phase_accept_cb, # For visibility update from get_sample via load_interface_data
1317
- first_phase_accept_cb # For value update from get_sample via load_interface_data
1318
  ]
1319
 
1320
  save_next_button.click(
@@ -1332,7 +1313,6 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1332
  inputs=[current_page_idx_state, current_idx_on_page_state],
1333
  outputs=navigation_outputs_extended
1334
  )
1335
-
1336
  approve_button.click(
1337
  fn=review_and_next_sample_second_phase,
1338
  inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("approved")],
@@ -1343,7 +1323,6 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1343
  inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("rejected")],
1344
  outputs=navigation_outputs_extended
1345
  )
1346
-
1347
  trim_button.click(
1348
  fn=trim_audio_action,
1349
  inputs=[current_page_idx_state, current_idx_on_page_state, trim_start_tb, trim_end_tb],
@@ -1357,9 +1336,8 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1357
  delete_button.click(
1358
  fn=confirm_delete_audio_action,
1359
  inputs=[current_page_idx_state, current_idx_on_page_state],
1360
- outputs=navigation_outputs_extended # confirm_delete now returns the 9-tuple
1361
  )
1362
-
1363
  jump_button.click(
1364
  fn=jump_to_absolute_idx,
1365
  inputs=[jump_text_tb, current_page_idx_state, current_idx_on_page_state],
@@ -1367,33 +1345,32 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1367
  )
1368
  hf_export_button.click(
1369
  fn=export_to_huggingface,
1370
- inputs=[hf_repo_name_tb, hf_token_state],
1371
  outputs=[hf_export_status_md],
1372
  queue=True
1373
  )
1374
 
1375
  if __name__ == "__main__":
1376
- # Global config overrides for testing can be placed here if needed
1377
- # Example: SECOND_PHASE = True
1378
-
1379
- # Initializations based on configuration (run once at start)
1380
- if total_samples <= 0: # If get_dataset_info() failed or wasn't run effectively before this
1381
- print("Main block: total_samples not positive. Attempting to get dataset info again.")
1382
- dataset_info = get_dataset_info() # Sets global total_samples
1383
- if total_samples > 0:
1384
- annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
1385
- else:
1386
- print("Main block: Still no total_samples. Ranges will be empty, app might not function fully.")
1387
 
 
 
 
 
 
 
 
 
 
 
 
1388
  if SECOND_PHASE:
1389
  print("==== APPLICATION LAUNCHING IN SECOND PHASE (REVIEW MODE) ====")
1390
- if not SECOND_PHASE_REVIEW_MAPPING and total_samples > 0 and ANNOTATORS:
1391
- print("Main block: Initializing second phase assignments...")
1392
- if not annotator_ranges: # Should be populated if total_samples is known
1393
- annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
1394
- initialize_second_phase_assignments()
1395
- elif not SECOND_PHASE_REVIEW_MAPPING:
1396
- print("Warning (Main block): Second phase active, but review mapping is empty. Check total_samples and ANNOTATORS list.")
1397
  else:
1398
  print("==== APPLICATION LAUNCHING IN FIRST PHASE (ANNOTATION MODE) ====")
1399
 
 
3
  import json
4
  import pandas as pd
5
  from datasets import load_dataset, DatasetDict, Dataset, Audio
6
+ from huggingface_hub import HfApi, whoami, login, hf_hub_download, HfHubHTTPError
7
  import tempfile
8
  import shutil
9
  import gc
 
17
  from pydantic import BaseModel
18
  from typing import Optional, List, Tuple
19
  from datetime import datetime
20
+ import requests
21
 
22
  # Log in with Hugging Face token
23
  token = os.getenv("hf_token")
24
  if token:
25
+ try:
26
+ login(token)
27
+ print("Successfully logged in using hf_token environment variable.")
28
+ except Exception as e:
29
+ print(f"Failed to login with hf_token environment variable: {e}")
30
+ token = None # Ensure token is None if login fails
31
  else:
32
+ print("Warning: hf_token environment variable not set. Hugging Face Hub operations might fail unless token is provided via UI.")
33
 
34
  # Configuration
35
  HF_DATASET_NAME = "navidved/channelb-raw-data"
36
+ AUDIO_DIR = "audio"
37
+ SAVE_PATH = "annotations.json" # Local filename for annotations
38
  ALLOWED_USERS = ["shahab7", "amirnamini23", "Mohsen711", "mahya2025", "najmeh00", "sepehr21ar", "zahraemarati", "moghim72", "amin76", "vargha", "navidved"]
39
  REVIEWERS = ["vargha", "navidved"]
40
  ANNOTATORS = [user for user in ALLOWED_USERS if user not in REVIEWERS]
41
  CURRENT_USERNAME = None
42
  PAGE_SIZE = 100
43
+ # SAVE_INTERVAL = 1 # FOR DEBUGGING: PUSH ON EVERY SAVE
44
+ SAVE_INTERVAL = 10 # Normal operation: push every 10 saves
45
 
46
  # --- SECOND PHASE CONFIGURATION ---
47
+ SECOND_PHASE = False
48
+ SECOND_PHASE_REVIEW_MAPPING = {}
49
 
50
  # Global state variables
51
+ current_page = 0
52
+ current_page_data = None
53
+ audio_backup = {}
54
+ annotation_count = 0 # Counts saves since login for the current session
 
55
  unsaved_changes = {}
56
+ total_samples = 0
57
+ annotator_ranges = {}
58
 
59
  # Pydantic data models
60
  class AudioTrim(BaseModel):
 
74
  update_at: datetime
75
 
76
  class Sample(BaseModel):
77
+ id: int
78
  voice_name: str
79
  original_subtitle: str
80
  ignore_it: bool = False
 
89
def load_saved_annotations():
    """Load the annotations JSON, preferring the local cache over the Hub copy.

    Resolution order:
      1. Local file at ``SAVE_PATH`` (renamed aside with a ``.corrupt.*``
         suffix when it cannot be parsed/validated).
      2. The same filename downloaded from the ``HF_DATASET_NAME`` dataset
         repo — only attempted when a token is available; on success the
         payload is re-cached to ``SAVE_PATH``.
      3. A fresh, empty ``DatasetModel``.

    Returns:
        DatasetModel: never ``None``; an empty model when nothing could be
        loaded from either location.
    """
    global token  # set by the startup login or the UI login flow

    dataset_model = None
    local_file_loaded_successfully = False
    annotations_filename_in_repo = os.path.basename(SAVE_PATH)  # e.g., "annotations.json"

    if os.path.exists(SAVE_PATH):
        try:
            with open(SAVE_PATH, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Accept either a populated {"samples": [...]} payload or an empty dict.
            if "samples" in data or not data:
                dataset_model = DatasetModel(**data)
                print(f"Loaded annotations from local JSON file: {SAVE_PATH}")
                local_file_loaded_successfully = True
            else:
                print(f"Local JSON file {SAVE_PATH} has incorrect structure. Ignoring.")
        except Exception as e:
            print(f"Error loading local JSON file '{SAVE_PATH}': {str(e)}. Will try HF Hub or create new.")
            # Move the unreadable file aside so the next save starts clean.
            try:
                corrupt_path = SAVE_PATH + ".corrupt." + datetime.now().strftime("%Y%m%d%H%M%S%f")
                os.rename(SAVE_PATH, corrupt_path)
                print(f"Renamed corrupt local file to {corrupt_path}")
            except OSError as re_e:
                print(f"Could not rename corrupt local file: {re_e}")

    if not local_file_loaded_successfully and token:
        print(f"Local annotations not loaded or not found/corrupt. Trying Hugging Face Hub for {annotations_filename_in_repo}...")
        try:
            hf_path = hf_hub_download(
                repo_id=HF_DATASET_NAME,
                filename=annotations_filename_in_repo,
                repo_type="dataset",
                token=token,
            )
            with open(hf_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            dataset_model = DatasetModel(**data)
            # Re-cache the Hub copy locally so later loads skip the download.
            with open(SAVE_PATH, "w", encoding="utf-8") as f_cache:
                f_cache.write(dataset_model.model_dump_json(exclude_none=True, indent=4))
            print(f"Loaded annotations from HF '{HF_DATASET_NAME}/{annotations_filename_in_repo}' and cached to '{SAVE_PATH}'.")
        except HfHubHTTPError as e:
            # FIX: e.response can be None (connection-level failures wrapped by
            # huggingface_hub) — guard before dereferencing status_code.
            if e.response is not None and e.response.status_code == 404:
                print(f"Annotations file '{annotations_filename_in_repo}' not found on HF repo '{HF_DATASET_NAME}'. This is normal if it's the first run or not pushed yet.")
            else:
                print(f"Error loading JSON file from HF repo '{HF_DATASET_NAME}/{annotations_filename_in_repo}': {str(e)}")
        except Exception as e:
            print(f"Unexpected error loading JSON file from HF repo '{HF_DATASET_NAME}/{annotations_filename_in_repo}': {str(e)}")

    if dataset_model is None:
        print("No valid annotations found locally or on HF Hub (or failed to load). Creating new empty DatasetModel.")
        dataset_model = DatasetModel(samples=[])
    return dataset_model
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
def push_json_to_hf():
    """Upload the local annotations file to the dataset repo on the HF Hub.

    Best-effort: validates the global token via ``whoami`` first, then calls
    ``HfApi.upload_file``. All failures are logged and swallowed so a push
    problem never interrupts the annotation workflow.
    """
    global token  # Use the globally set token from hf_login

    remote_name = os.path.basename(SAVE_PATH)

    # Guard clause: nothing to do without credentials.
    if not token:
        print("Push to HF: Aborted. Token not available/set.")
        return

    print(f"Push to HF: Attempting to upload '{SAVE_PATH}' as '{remote_name}' to '{HF_DATASET_NAME}'.")

    # Sanity-check the token before attempting the upload.
    try:
        who = whoami(token=token)
    except Exception as e_whoami:
        print(f"Push to HF: Token seems invalid or whoami failed. Error: {e_whoami}")
        print("Push to HF: Aborting upload due to token validation issue.")
        return
    print(f"Push to HF: Token confirmed for user '{who.get('name')}'.")

    try:
        HfApi().upload_file(
            path_or_fileobj=SAVE_PATH,           # local path to the file
            path_in_repo=remote_name,            # name of the file in the repository
            repo_type="dataset",
            repo_id=HF_DATASET_NAME,
            token=token,
            commit_message=f"Updated {remote_name} via annotation tool at {datetime.now().isoformat()}",
        )
    except Exception as e:
        print(f"Push to HF: Error uploading '{remote_name}' to '{HF_DATASET_NAME}'. Error: {str(e)}")
        import traceback
        print("Push to HF: Traceback below:")
        traceback.print_exc()
    else:
        print(f"Push to HF: Successfully uploaded '{remote_name}' to Hugging Face repository '{HF_DATASET_NAME}'.")
176
+
177
+ def save_annotations(dataset_model: DatasetModel):
178
+ global annotation_count, token # Make sure we're using the global token
179
+
180
+ # DEBUGGING PRINT
181
+ print(f"Debug (save_annotations): annotation_count (before inc)={annotation_count}, SAVE_INTERVAL={SAVE_INTERVAL}, token_is_truthy={bool(token)}")
182
+
183
+ try:
184
+ with open(SAVE_PATH, "w", encoding="utf-8") as f:
185
+ f.write(dataset_model.model_dump_json(exclude_none=True, indent=4))
186
+ print(f"Saved annotations locally to {SAVE_PATH}")
187
+
188
+ annotation_count += 1 # Increment after successful local save
189
+
190
+ if token and (annotation_count % SAVE_INTERVAL == 0):
191
+ print(f"Debug (save_annotations): Conditions met for HF push. Current annotation_count={annotation_count}.")
192
+ push_json_to_hf()
193
+ elif not token:
194
+ print(f"Debug (save_annotations): HF push skipped. Token is not available. annotation_count={annotation_count}.")
195
+ else: # Token is available, but interval not met
196
+ print(f"Debug (save_annotations): HF push skipped. Interval not met. annotation_count={annotation_count}. "
197
+ f"Need {(SAVE_INTERVAL - (annotation_count % SAVE_INTERVAL)) % SAVE_INTERVAL} more saves for next push (or 0 if at interval).")
198
+
199
  except Exception as e:
200
+ print(f"Error in save_annotations (local save or triggering push): {str(e)}")
201
+ import traceback
202
+ print("Traceback for save_annotations error:")
203
+ traceback.print_exc()
204
+
205
 
206
  def calculate_annotator_ranges(total_samples_val, annotators_list):
207
  num_annotators = len(annotators_list)
 
219
  end_idx += 1
220
  if end_idx >= total_samples_val:
221
  end_idx = total_samples_val -1
222
+ if start_idx <= end_idx:
223
  ranges[annotator] = (start_idx, end_idx)
224
  start_idx = end_idx + 1
225
  print(f"Calculated annotator ranges: {ranges}")
 
244
  SECOND_PHASE_REVIEW_MAPPING[annotator] = annotator
245
  print(f"Second phase: {annotator} will review their own work.")
246
  else:
247
+ for i, reviewer_user in enumerate(ANNOTATORS):
248
  original_annotator_idx = (i - 1 + len(ANNOTATORS)) % len(ANNOTATORS)
249
  original_annotator_user = ANNOTATORS[original_annotator_idx]
250
  SECOND_PHASE_REVIEW_MAPPING[reviewer_user] = original_annotator_user
 
255
  print(f"Warning: Original annotator {original_annotator} (being reviewed by {reviewer}) has no range defined in annotator_ranges.")
256
 
257
def get_user_allowed_range(username):
    """Return the (start_abs_idx, end_abs_idx) sample range assigned to a user.

    Second phase: the range is that of the original annotator this user
    reviews (lazily initializing the review mapping and annotator ranges as
    needed). First phase: reviewers get the full dataset range; annotators
    get their slice from ``annotator_ranges``. Returns None when no range
    applies.
    """
    global annotator_ranges, total_samples, ANNOTATORS  # Ensure ANNOTATORS is accessible

    def ranges_need_build():
        # Ranges can be (re)computed only once the dataset size is known.
        return not annotator_ranges and total_samples > 0 and ANNOTATORS

    if SECOND_PHASE:
        if not SECOND_PHASE_REVIEW_MAPPING:  # lazily initialize the mapping
            if ranges_need_build():
                annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
            initialize_second_phase_assignments()  # populates SECOND_PHASE_REVIEW_MAPPING

        reviewed_annotator = SECOND_PHASE_REVIEW_MAPPING.get(username)
        if not reviewed_annotator:
            # User not in the review mapping (e.g. a first-phase reviewer
            # outside the ANNOTATORS cycle).
            return None
        if ranges_need_build():
            annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
        return annotator_ranges.get(reviewed_annotator)

    # First Phase Logic
    if get_user_role(username) == "reviewer":
        return (0, total_samples - 1) if total_samples > 0 else None
    elif ranges_need_build():
        annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
    return annotator_ranges.get(username)
 
297
  if total_samples > 0:
298
  return {'num_samples': total_samples}
299
  try:
300
+ print(f"Attempting to load dataset info for {HF_DATASET_NAME} (non-streaming)...")
301
  ds_info_obj = load_dataset(HF_DATASET_NAME, split="train", streaming=False)
302
  num_samples_val = ds_info_obj.num_rows
303
  if num_samples_val and num_samples_val > 0:
 
305
  print(f"Dataset info: total_samples set to {total_samples}")
306
  return {'num_samples': total_samples}
307
  else:
308
+ print(f"Warning: ds_info_obj.num_rows was not positive ({num_samples_val}). Trying iteration for count (may be slow).")
309
  ds_stream = load_dataset(HF_DATASET_NAME, split="train", streaming=True)
310
  count = 0
311
+ for _ in ds_stream: # This will iterate over the whole dataset if num_rows is wrong
312
  count +=1
313
+ if count % 10000 == 0: print(f"Counting by iteration... at {count}") # Progress for large datasets
314
  if count > 0:
315
  total_samples = count
316
  print(f"Dataset info: total_samples set to {total_samples} by iteration.")
 
320
  total_samples = -1
321
  return {'num_samples': -1}
322
  except Exception as e:
323
+ print(f"Error getting dataset info for {HF_DATASET_NAME}: {e}")
324
  total_samples = -1
325
  return {'num_samples': -1}
326
 
327
+ # Initial data load attempt (will be re-attempted more robustly in hf_login)
328
+ # dataset_info = get_dataset_info()
329
+ # if total_samples > 0:
330
+ # annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
331
+ # if SECOND_PHASE:
332
+ # initialize_second_phase_assignments()
333
+ # else:
334
+ # print("Initial check: total_samples is not positive. Will rely on login process to set this.")
335
+ # annotator_ranges = {}
336
+
337
 
338
  def get_audio_path(audio_entry):
339
  if isinstance(audio_entry, dict):
 
352
  return audio_entry
353
  return None
354
 
 
355
  def load_page_data(page_num_within_user_view=0):
356
  global current_page_data, current_page
357
 
 
358
  current_page_data = pd.DataFrame(columns=["audio", "sentence", "id_within_page", "absolute_idx"])
359
  current_page = page_num_within_user_view
360
 
 
369
  print(f"User {CURRENT_USERNAME} has an invalid allowed range: {user_allowed_range}")
370
  return current_page_data
371
 
 
372
  page_global_start_idx = user_start_abs + (page_num_within_user_view * PAGE_SIZE)
373
 
374
  if page_global_start_idx > user_end_abs:
375
  print(f"Requested page {page_num_within_user_view} (abs start {page_global_start_idx}) is beyond user {CURRENT_USERNAME}'s allowed samples end ({user_end_abs}).")
376
+ return current_page_data
377
 
 
378
  page_global_end_idx = min(page_global_start_idx + PAGE_SIZE - 1, user_end_abs)
 
 
379
  num_samples_on_this_page = page_global_end_idx - page_global_start_idx + 1
380
 
381
  if num_samples_on_this_page <= 0:
 
 
382
  print(f"No samples for user {CURRENT_USERNAME} on their page {page_num_within_user_view}. Calculated range for page: [{page_global_start_idx}-{page_global_end_idx}]")
383
+ return current_page_data
384
 
385
  print(f"Loading page {page_num_within_user_view} for user {CURRENT_USERNAME}. "
386
  f"Effective absolute dataset range for this page: [{page_global_start_idx}-{page_global_end_idx}] "
 
388
  f"Will attempt to load {num_samples_on_this_page} samples.")
389
 
390
  try:
391
+ ds_full = load_dataset(HF_DATASET_NAME, split="train", streaming=True, token=token if token else None) # Use token for private datasets
 
392
  ds_page_specific = ds_full.skip(page_global_start_idx)
 
393
  page_iterable = ds_page_specific.take(num_samples_on_this_page)
394
  except Exception as e:
395
  print(f"Error loading or processing dataset via skip/take for page data: {e}")
 
396
  return current_page_data
397
 
398
  samples_on_page_list = []
 
399
  current_processing_abs_idx = page_global_start_idx
400
 
 
401
  for id_on_page_counter, sample_data_item in enumerate(page_iterable):
402
  sample_data_item['absolute_idx'] = current_processing_abs_idx
403
+ sample_data_item['id_within_page'] = id_on_page_counter
404
  samples_on_page_list.append(sample_data_item)
 
405
  current_processing_abs_idx += 1
 
 
406
  if id_on_page_counter + 1 >= num_samples_on_this_page:
407
  break
408
 
 
412
  f"First abs_idx: {samples_on_page_list[0]['absolute_idx']}, "
413
  f"Last abs_idx: {samples_on_page_list[-1]['absolute_idx']}.")
414
  else:
 
415
  print(f"No samples were loaded for page {page_num_within_user_view} (user: {CURRENT_USERNAME}) "
416
+ f"despite expecting {num_samples_on_this_page} from range [{page_global_start_idx}-{page_global_end_idx}]. ")
 
 
417
 
418
  gc.collect()
419
  return current_page_data
 
420
 
421
+ # Core functions (save_sample_data, handle_second_phase_action, get_sample, load_interface_data, navigation functions, jump, trim, export etc. remain largely the same as your previous version)
422
+ # ... (Keep the rest of your functions from the previous version here)
423
+ # For brevity, I'm omitting the bulk of the functions that were not directly related to the HF save issue or initial loading.
424
+ # Make sure to include:
425
+ # - save_sample_data
426
+ # - handle_second_phase_action
427
+ # - get_sample
428
+ # - load_interface_data
429
+ # - navigate_sample and its wrappers
430
+ # - jump_to_absolute_idx
431
+ # - trim_audio_action, undo_trim_action, confirm_delete_audio_action
432
+ # - export_to_huggingface
433
+ # - hf_login (ensure it correctly calls get_dataset_info, calculate_annotator_ranges, load_page_data, etc. *after* successful auth)
434
+
435
  def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_action, accepted_flag=False):
436
  global current_page_data, unsaved_changes
437
 
 
449
  audio_entry_original = actual_sample_info["audio"]
450
  voice_name = os.path.basename(str(get_audio_path(audio_entry_original) or f"sample_{absolute_idx}"))
451
 
452
+ dataset_model = load_saved_annotations() # This will load existing or create new
453
  sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
454
 
455
  if not sample:
 
500
  if absolute_idx in unsaved_changes:
501
  del unsaved_changes[absolute_idx]
502
 
503
+ save_annotations(dataset_model) # This will save locally and potentially push to HF
504
  return f"✓ Saved annotation for sample {absolute_idx}"
505
 
506
  def handle_second_phase_action(page_idx, idx_on_page, action: str):
 
529
  print(f"Warning: No prior annotation by {original_annotator_to_review} for sample {absolute_idx}. Creating placeholder for review.")
530
  annotation_to_review = Annotation(
531
  annotator=original_annotator_to_review,
532
+ annotated_subtitle=sample.original_subtitle, # Or actual_sample_info["sentence"]
533
  create_at=datetime.now(),
534
  update_at=datetime.now()
535
  )
 
543
 
544
  if action == "approved":
545
  sample.is_approved_in_second_phase = True
546
+ # else: sample.is_approved_in_second_phase = False # Explicitly set to False on rejection
 
547
 
548
  save_annotations(dataset_model)
549
  return f"✓ Review ({action}) saved for sample {absolute_idx} (Original annotator: {original_annotator_to_review})"
 
552
  global current_page_data, total_samples
553
 
554
  if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
555
+ # Default empty values for all expected return items
556
  return None, "", f"Invalid index ({idx_on_page}) for current page data (len {len(current_page_data) if current_page_data is not None else 'None'}).", "unreviewed", "white", True, False, "", gr.update(visible=False)
557
 
558
  actual_sample_info = current_page_data.iloc[idx_on_page]
 
561
  audio_entry_original = actual_sample_info["audio"]
562
  audio_val = get_audio_path(audio_entry_original)
563
 
564
+ default_transcript = actual_sample_info.get("sentence", "") # Use .get for safety
565
  transcript_to_display = default_transcript
566
 
567
  ui_reviewer_field = "unreviewed"
568
  ui_color = "white"
569
  ui_editable = True
570
+ ui_is_accepted_flag = False
571
 
572
  status_prefix = ""
573
  user_allowed_range = get_user_allowed_range(current_user_displaying)
574
  if user_allowed_range:
575
  user_start_abs, user_end_abs = user_allowed_range
576
+ # Ensure user_start_abs is valid before calculation
577
+ if user_start_abs is not None and absolute_idx >= user_start_abs :
578
+ current_sample_num_in_user_assignment = absolute_idx - user_start_abs + 1
579
+ total_samples_for_user = user_end_abs - user_start_abs + 1
580
+ status_prefix = f"Sample {current_sample_num_in_user_assignment} of {total_samples_for_user} for you (Abs Idx {absolute_idx})."
581
+ else: # Fallback if range is odd or absolute_idx is somehow outside
582
+ status_prefix = f"Sample (Abs Idx {absolute_idx}). Range issue for user stats."
583
+
584
  else:
585
+ status_prefix = f"Sample (Abs Idx {absolute_idx}). No range assigned."
586
 
587
  dataset_model = load_saved_annotations()
588
  sample_from_json = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
 
609
 
610
  if annotation_under_review:
611
  transcript_to_display = annotation_under_review.annotated_subtitle or default_transcript
 
612
  ui_is_accepted_flag = (annotation_under_review.second_phase_review_status == "approved" and
613
  annotation_under_review.second_phase_reviewed_by == current_user_displaying)
614
 
 
619
  ui_color = "gray"
620
  ui_reviewer_field += f" (Already reviewed by {annotation_under_review.second_phase_reviewed_by} as {annotation_under_review.second_phase_review_status})"
621
  else:
622
+ ui_color = "yellow"
623
  else:
624
  transcript_to_display = default_transcript
625
  ui_reviewer_field += " (No submission by original annotator)"
 
632
  transcript_to_display = accepted_first_phase_annotation.annotated_subtitle or default_transcript
633
  ui_reviewer_field = f"Accepted by: {accepted_first_phase_annotation.first_phase_reviewer_username}"
634
  ui_color = "green"
635
+ ui_is_accepted_flag = True
636
  ui_editable = (get_user_role(current_user_displaying) == "reviewer")
637
  else:
638
  user_specific_annotation = next((a for a in sample_from_json.annotations or [] if a.annotator == current_user_displaying), None)
 
641
  ui_reviewer_field = f"Your draft (as {user_specific_annotation.annotator})"
642
  ui_color = "yellow"
643
  ui_editable = True
644
+ else:
 
645
  other_annotations = [a for a in sample_from_json.annotations or [] if not a.is_first_phase_accepted]
646
  if other_annotations:
647
  if get_user_role(current_user_displaying) == "reviewer":
648
+ other_ann_to_show = other_annotations[0]
649
  transcript_to_display = other_ann_to_show.annotated_subtitle or default_transcript
650
  ui_reviewer_field = f"Draft by: {other_ann_to_show.annotator}"
651
  ui_color = "blue"
652
  ui_editable = True
653
+ else:
 
654
  transcript_to_display = default_transcript
655
  ui_reviewer_field = f"Labeled by: {other_annotations[0].annotator}"
656
  ui_color = "lightblue"
657
+ ui_editable = False
 
658
 
 
659
  if not SECOND_PHASE and absolute_idx in unsaved_changes:
660
  ui_color = "pink"
661
 
 
665
  else:
666
  ui_status_message += " (Annotation Phase)"
667
 
 
668
  show_accept_checkbox = not SECOND_PHASE and get_user_role(current_user_displaying) == "reviewer"
669
 
670
  return audio_val, transcript_to_display, ui_status_message, ui_reviewer_field, ui_color, ui_editable, ui_is_accepted_flag, default_transcript, gr.update(visible=show_accept_checkbox)
671
 
672
def load_interface_data(page_idx_user_relative, idx_on_page):
    """Fetch a sample via get_sample and shape it into the 9-slot UI tuple.

    Returns, in order: page index, index on page, audio value, transcript
    textbox update, reviewer textbox update, status markdown, original
    transcript state, accept-checkbox visibility update, and the accept
    checkbox value.
    """
    sample = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
    (audio, text, base_status, saved_reviewer_text, color,
     editable, accepted_flag, original_dataset_text, accept_cb_vis) = sample

    transcript_update = gr.update(value=text, interactive=editable)
    reviewer_update = gr.update(value=saved_reviewer_text, elem_classes=[color])

    return (
        page_idx_user_relative,   # 0
        idx_on_page,              # 1
        audio,                    # 2
        transcript_update,        # 3 transcript_tb
        reviewer_update,          # 4 reviewer_tb
        base_status,              # 5 status_md
        original_dataset_text,    # 6 original_transcript_state
        accept_cb_vis,            # 7 first_phase_accept_cb (visibility part)
        accepted_flag,            # 8 first_phase_accept_cb (value part)
    )
687
 
688
  def navigate_sample(page_idx_user_relative, idx_on_page, direction: int):
 
691
  if current_page_data is None or len(current_page_data) == 0:
692
  user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
693
  err_msg = "No data loaded. Try reloading or check your assigned range."
694
+ if not user_allowed_range or (user_allowed_range[0] > user_allowed_range[1]): # check for invalid range
695
  err_msg = "You have no samples assigned or your range is invalid."
696
 
697
+ # Return a 9-tuple consistent with load_interface_data's structure
698
  return page_idx_user_relative, idx_on_page, None, gr.update(value="Error", interactive=False), gr.update(value="Error"), err_msg, "", gr.update(visible=False), False
699
 
700
 
 
703
  new_idx_on_page = target_idx_on_page
704
 
705
  user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
706
+ # This check should ideally not be hit if current_page_data exists, but good safeguard
707
  if not user_allowed_range:
708
+ # Use get_sample to fetch current state with an error message
709
+ current_state = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
710
+ # current_state is a 9-tuple: (audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update)
711
+ return page_idx_user_relative, idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), "Error: No allowed range for navigation.", current_state[7], current_state[8], current_state[6]
712
 
713
 
714
+ if target_idx_on_page < 0: # Moving to previous page or beginning of assignment
715
  if page_idx_user_relative > 0:
716
  new_page_idx_user_relative = page_idx_user_relative - 1
717
  temp_data = load_page_data(new_page_idx_user_relative)
718
  if temp_data is not None and not temp_data.empty:
719
  new_idx_on_page = len(temp_data) - 1
720
+ else: # Previous page is empty (shouldn't happen if logic is correct)
721
+ current_state = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
722
+ status = current_state[2] + " [Already at the first sample of this page/range]"
723
+ return page_idx_user_relative, idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), status, current_state[7], current_state[8], current_state[6]
724
+ else: # Already on first item of first user-relative page
725
+ current_state = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
726
+ status = current_state[2] + " [At the beginning of your assigned samples]"
727
+ return page_idx_user_relative, idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), status, current_state[7], current_state[8], current_state[6]
728
+
729
+ elif target_idx_on_page >= len(current_page_data): # Moving to next page or end of assignment
730
  new_page_idx_user_relative = page_idx_user_relative + 1
731
  temp_data = load_page_data(new_page_idx_user_relative)
732
  if temp_data is not None and not temp_data.empty:
733
  new_idx_on_page = 0
734
+ else: # Next user-relative page is empty (means we are at the end of user's allowed samples)
735
  current_abs_idx_check = -1
736
  if current_page_data is not None and not current_page_data.empty and idx_on_page < len(current_page_data):
737
  current_abs_idx_check = current_page_data.iloc[idx_on_page]['absolute_idx']
738
 
739
  is_at_very_end = user_allowed_range and current_abs_idx_check != -1 and current_abs_idx_check >= user_allowed_range[1]
740
 
741
+ current_state = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
742
+ status = current_state[2]
743
  if is_at_very_end:
744
+ status += " [At the end of your assigned samples]"
745
  else:
746
+ status += " [No more samples in this direction (next page empty or end of assignment)]"
747
+ return page_idx_user_relative, idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), status, current_state[7], current_state[8], current_state[6]
748
 
749
+ # If navigation is within the current page or to a new valid page/index
750
  return load_interface_data(new_page_idx_user_relative, new_idx_on_page)
751
 
752
  def go_next_sample_wrapper(page_idx_user_relative, idx_on_page):
 
771
 
772
  def jump_to_absolute_idx(target_abs_idx_str, current_page_idx_user_relative, current_idx_on_page):
773
  global current_page_data
774
+ # Fallback return using current state if jump fails
775
+ def _fallback_return(status_message_suffix=""):
776
+ current_state = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
777
+ status = current_state[2] + status_message_suffix
778
+ return current_page_idx_user_relative, current_idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), status, current_state[7], current_state[8], current_state[6]
779
 
780
  try:
781
  target_abs_idx = int(target_abs_idx_str)
 
783
 
784
  user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
785
  if not user_allowed_range or not is_within_range(target_abs_idx, user_allowed_range):
786
+ return _fallback_return(f" [Target index {target_abs_idx} is outside your assigned range {user_allowed_range or 'N/A'}.]")
 
 
787
 
788
  user_start_abs, _ = user_allowed_range
789
  offset_from_user_start = target_abs_idx - user_start_abs
790
 
791
  if offset_from_user_start < 0:
792
+ return _fallback_return(f" [Logic Error: Target index {target_abs_idx} has negative offset from user start {user_start_abs}.]")
 
 
 
793
 
794
  new_user_relative_page_idx = offset_from_user_start // PAGE_SIZE
795
+ # load_page_data updates global current_page_data and current_page
796
  temp_page_data_df = load_page_data(new_user_relative_page_idx)
797
 
798
  if temp_page_data_df is None or temp_page_data_df.empty:
799
+ return _fallback_return(f" [No data found for your page {new_user_relative_page_idx} (containing abs index {target_abs_idx})].")
800
+
801
+ # Calculate new_idx_on_page based on the target_abs_idx relative to the start of the loaded page
802
+ # The loaded page (current_page_data) now starts at `user_start_abs + new_user_relative_page_idx * PAGE_SIZE`
803
+ page_actual_start_abs = current_page_data.iloc[0]['absolute_idx'] if not current_page_data.empty else -1
804
+
805
+ if page_actual_start_abs == -1: # Should not happen if temp_page_data_df was not empty
806
+ return _fallback_return(f" [Error: Page {new_user_relative_page_idx} loaded empty unexpectedly.]")
807
+
808
+ new_idx_on_page_actual = target_abs_idx - page_actual_start_abs
809
+
810
+ if not (0 <= new_idx_on_page_actual < len(current_page_data)):
811
+ # This means target_abs_idx was in the user's range for this page, but the page didn't actually contain it
812
+ # (e.g. dataset ended prematurely within this page's expected span)
813
+ # Default to first item on the successfully loaded (but perhaps shorter) page.
814
+ print(f"Warning: Target index {target_abs_idx} resulted in out-of-bounds id_on_page ({new_idx_on_page_actual}) for loaded page. Defaulting to 0.")
815
+ new_idx_on_page_actual = 0
816
+ if current_page_data.empty: # Should be caught above
817
+ return _fallback_return(f" [Page {new_user_relative_page_idx} is empty after load attempt for jump.]")
 
 
818
 
819
  return load_interface_data(new_user_relative_page_idx, new_idx_on_page_actual)
820
 
821
  except ValueError:
822
+ return _fallback_return(" [Invalid index format for jump.]")
823
  except Exception as e:
824
  import traceback
825
+ print(f"Error jumping to index: {str(e)}\n{traceback.format_exc()}")
826
+ return _fallback_return(f" [Error jumping to index: {str(e)}]")
827
 
 
 
 
828
 
829
  def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_end_str):
 
830
  def _return_current_state_with_message(msg_suffix):
831
+ loaded_data = load_interface_data(page_idx_user_relative, idx_on_page)
832
+ return (*loaded_data[0:5], loaded_data[5] + f" [{msg_suffix}]", *loaded_data[6:])
833
 
834
  if SECOND_PHASE: return _return_current_state_with_message("Trimming disabled in Review Phase.")
835
 
 
838
 
839
  actual_sample_info = current_page_data.iloc[idx_on_page]
840
  absolute_idx = actual_sample_info['absolute_idx']
 
841
  original_audio_path_info = get_audio_path(actual_sample_info["audio"])
842
  source_basename_for_trimmed_file = os.path.basename(str(original_audio_path_info)) if isinstance(original_audio_path_info, str) else f"sample_raw_data_{absolute_idx}"
 
843
  audio_seg = None
844
  temp_dir_for_download = None
845
 
846
  try:
847
+ if isinstance(original_audio_path_info, tuple):
848
  sr, audio_array = original_audio_path_info
849
  if not isinstance(audio_array, np.ndarray): return _return_current_state_with_message("Raw audio data is not a numpy array.")
850
  if audio_array.size == 0: return _return_current_state_with_message("Cannot trim empty audio array.")
 
 
851
  audio_array = np.ascontiguousarray(audio_array)
852
  channels = 1 if audio_array.ndim == 1 else (audio_array.shape[1] if audio_array.ndim == 2 and audio_array.shape[1] in [1,2] else (audio_array.shape[0] if audio_array.ndim == 2 and audio_array.shape[0] in [1,2] else 0))
853
  if channels == 0: return _return_current_state_with_message(f"Unsupported audio array shape or channels: {audio_array.shape}")
854
+ if audio_array.ndim == 2 and audio_array.shape[0] < audio_array.shape[1] and audio_array.shape[0] in [1, 2]: audio_array = np.ascontiguousarray(audio_array.T)
 
 
 
855
  if audio_array.dtype == np.float32 or audio_array.dtype == np.float64: audio_array_int = (audio_array * np.iinfo(np.int16).max).astype(np.int16)
856
  elif audio_array.dtype == np.int16: audio_array_int = audio_array
857
+ elif audio_array.dtype == np.int32: audio_array_int = (audio_array >> 16).astype(np.int16)
858
  else: return _return_current_state_with_message(f"Unsupported numpy array dtype for raw audio: {audio_array.dtype}")
 
859
  sample_width = audio_array_int.itemsize
860
  audio_seg = AudioSegment(data=audio_array_int.tobytes(), sample_width=sample_width, frame_rate=sr, channels=channels)
861
+ elif isinstance(original_audio_path_info, str):
 
862
  audio_to_load = original_audio_path_info
863
  if not (os.path.exists(audio_to_load) or audio_to_load.startswith("http")): return _return_current_state_with_message("Audio file path is invalid, does not exist, or is not a valid URL.")
864
  if audio_to_load.startswith("http"):
 
871
  audio_seg = AudioSegment.from_file(audio_to_load)
872
  else:
873
  return _return_current_state_with_message("Trimming not supported for this audio source.")
 
874
  if audio_seg is None: return _return_current_state_with_message("Failed to load audio segment.")
 
875
  try: start_s, end_s = float(trim_start_str), float(trim_end_str)
876
  except ValueError: return _return_current_state_with_message("Invalid trim times: Start and End must be numbers.")
877
  start_ms, end_ms, audio_duration_ms = int(start_s * 1000), int(end_s * 1000), len(audio_seg)
878
  if not (0 <= start_ms < end_ms and end_ms <= audio_duration_ms):
879
+ return _return_current_state_with_message(f"Invalid trim times: start={start_s}s, end={end_s}s for audio of {audio_duration_ms/1000.0:.2f}s.")
 
880
  trimmed_seg = audio_seg[start_ms:end_ms]
881
  os.makedirs("trimmed_audio", exist_ok=True)
882
  safe_voice_name = re.sub(r'[^\w.-]', '_', source_basename_for_trimmed_file)
883
  trimmed_filename = f"trimmed_{absolute_idx}_{safe_voice_name}"
884
+ if not os.path.splitext(trimmed_filename)[1]: trimmed_filename += ".wav"
885
  trimmed_path = os.path.join("trimmed_audio", trimmed_filename)
886
  export_format = os.path.splitext(trimmed_path)[1][1:].lower() or "wav"
887
  trimmed_seg.export(trimmed_path, format=export_format)
 
888
  dataset_model = load_saved_annotations()
889
  sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
 
890
  if not sample:
891
  sample = Sample(id=absolute_idx, voice_name=os.path.basename(str(get_audio_path(actual_sample_info["audio"]) or f"sample_{absolute_idx}")),
892
  original_subtitle=actual_sample_info["sentence"], annotations=[])
893
  dataset_model.samples = dataset_model.samples or []
894
  dataset_model.samples.append(sample)
 
895
  now = datetime.now()
896
  annotation = next((a for a in sample.annotations or [] if a.annotator == CURRENT_USERNAME), None)
897
  if not annotation:
898
  annotation = Annotation(annotator=CURRENT_USERNAME, create_at=now, update_at=now)
899
  sample.annotations = sample.annotations or []
900
  sample.annotations.append(annotation)
 
901
  annotation.audio_trims = [AudioTrim(start=start_s, end=end_s)]
902
  annotation.update_at = now
903
  save_annotations(dataset_model)
904
 
905
  # Return full state, but with new audio path and status message
906
+ loaded_data_after_trim = load_interface_data(page_idx_user_relative, idx_on_page)
907
+ # The audio path needs to be overridden here to show the trimmed path
908
+ return (loaded_data_after_trim[0], loaded_data_after_trim[1], trimmed_path,
909
+ loaded_data_after_trim[3], loaded_data_after_trim[4],
910
+ loaded_data_after_trim[5] + " [Trimmed]",
911
+ *loaded_data_after_trim[6:])
912
  except Exception as e:
913
  import traceback
914
  print(f"Error during trim_audio_action for abs_idx {absolute_idx}: {str(e)}\n{traceback.format_exc()}")
 
919
 
920
  def undo_trim_action(page_idx_user_relative, idx_on_page):
921
  def _return_current_state_with_message(msg_suffix):
922
+ return load_interface_data(page_idx_user_relative, idx_on_page)[0:5] + \
923
+ (load_interface_data(page_idx_user_relative, idx_on_page)[5] + f" [{msg_suffix}]",) + \
924
+ load_interface_data(page_idx_user_relative, idx_on_page)[6:]
925
 
926
  if SECOND_PHASE: return _return_current_state_with_message("Undo Trim disabled in Review Phase.")
 
927
  if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
928
  return _return_current_state_with_message("Audio data not available (page error).")
929
 
 
936
  annotation.audio_trims = None
937
  annotation.update_at = datetime.now()
938
  save_annotations(dataset_model)
939
+ return _return_current_state_with_message("Trim undone") # Reloads UI showing original audio
 
 
 
940
 
941
  def confirm_delete_audio_action(page_idx_user_relative, idx_on_page):
942
+ def _return_current_state_with_message(msg_suffix=""): # Default to no suffix if just reloading
 
 
 
 
 
943
  loaded_data = load_interface_data(page_idx_user_relative, idx_on_page)
944
+ return (*loaded_data[0:5], loaded_data[5] + f" [{msg_suffix}]" if msg_suffix else loaded_data[5], *loaded_data[6:])
 
945
 
946
+ if SECOND_PHASE:
947
+ return _return_current_state_with_message("Delete disabled in Review Phase.")
948
  if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
949
+ return _return_current_state_with_message("Audio data not available (page error).")
 
 
950
 
951
  absolute_idx = current_page_data.iloc[idx_on_page]['absolute_idx']
952
  voice_name_original = os.path.basename(str(get_audio_path(current_page_data.iloc[idx_on_page]["audio"]) or f"sample_{absolute_idx}"))
 
953
  dataset_model = load_saved_annotations()
954
  sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
955
  if not sample:
956
+ sample = Sample(id=absolute_idx, voice_name=voice_name_original,
957
+ original_subtitle=current_page_data.iloc[idx_on_page]["sentence"], annotations=[])
 
 
 
 
958
  dataset_model.samples = dataset_model.samples or []
959
  dataset_model.samples.append(sample)
 
960
  sample.ignore_it = True
961
  now = datetime.now()
962
  deleted_text_marker = "AUDIO DELETED (This audio has been removed.)"
 
970
  sample.annotations = sample.annotations or []
971
  sample.annotations.append(annotation)
972
  save_annotations(dataset_model)
973
+ return _return_current_state_with_message() # Reload interface to show deleted status
974
 
 
 
 
 
 
975
def sanitize_string(s):
    """Coerce *s* to str and replace every character that is not a word
    character, hyphen, dot, or forward slash with an underscore.

    The hyphen is placed at the end of the character class so it is
    unambiguously literal; the original pattern ``[^\\w-./]`` put it right
    after ``\\w``, which Python's ``re`` parser flags as a possible set
    difference (FutureWarning). Behavior is unchanged: the kept set is
    still ``\\w``, ``-``, ``.``, ``/``.
    """
    if not isinstance(s, str):
        s = str(s)
    return re.sub(r'[^\w./-]', '_', s)
 
986
  print("Cannot push to hub: No token provided for push_to_hub_with_retry.")
987
  return
988
  print(f"Pushing dataset to {repo_id}")
989
+ dataset_dict.push_to_hub(repo_id, private=private, token=token_val) # Make sure this token has write access
990
 
991
  def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progress()):
992
  if not hf_token_for_export:
 
1000
 
1001
  dataset_model_annotations = load_saved_annotations()
1002
 
 
1003
  current_total_samples = total_samples
1004
  if current_total_samples <= 0:
1005
+ info = get_dataset_info()
1006
  current_total_samples = total_samples
1007
  if current_total_samples <= 0:
1008
  return "Export failed: Total number of samples is unknown or invalid."
1009
 
1010
+ ds_source = load_dataset(HF_DATASET_NAME, split="train", streaming=False, token=hf_token_for_export) # Use token for private source
1011
 
1012
+ iteration_limit = len(ds_source)
1013
+ if iteration_limit != current_total_samples:
1014
+ print(f"Warning: Source dataset length ({iteration_limit}) mismatches cached total_samples ({current_total_samples}). Using source length for export.")
 
 
 
 
 
 
 
 
1015
 
1016
 
1017
  exported_data_list = []
 
1021
  for i, source_sample in enumerate(ds_source):
1022
  if i >= iteration_limit: break
1023
  num_processed_from_source +=1
 
1024
  absolute_idx = i
 
1025
  audio_entry = source_sample.get("audio")
1026
  sentence_val = source_sample.get("sentence", "")
1027
  audio_dict_to_export = audio_entry
 
1038
  approved_anns = [a for a in annotation_data.annotations if a.second_phase_review_status == "approved"]
1039
  if SECOND_PHASE and approved_anns:
1040
  best_ann = sorted(approved_anns, key=lambda x: x.second_phase_review_timestamp or datetime.min, reverse=True)[0]
 
1041
  if not best_ann:
1042
  accepted_anns = [a for a in annotation_data.annotations if a.is_first_phase_accepted]
1043
  best_ann = sorted(accepted_anns, key=lambda x: x.update_at, reverse=True)[0] if accepted_anns else None
 
1044
  if not best_ann:
1045
  best_ann = sorted(annotation_data.annotations, key=lambda x: x.update_at, reverse=True)[0]
1046
 
1047
  if best_ann:
1048
  sentence_val = best_ann.annotated_subtitle if best_ann.annotated_subtitle is not None else sentence_val
1049
+ if best_ann.audio_trims and audio_dict_to_export:
1050
+ original_audio_path_for_trim_lookup = get_audio_path(audio_entry)
 
 
1051
  original_voice_name_for_trim = os.path.basename(str(original_audio_path_for_trim_lookup or f"sample_{absolute_idx}"))
1052
  safe_voice_name_for_trim = re.sub(r'[^\w.-]', '_', original_voice_name_for_trim)
1053
  trimmed_fname_base = f"trimmed_{absolute_idx}_{safe_voice_name_for_trim}"
1054
  potential_trimmed_path = os.path.join("trimmed_audio", trimmed_fname_base + ".wav")
 
1055
  if os.path.exists(potential_trimmed_path):
1056
  try:
1057
+ arr, sr_trim = sf.read(potential_trimmed_path) # Renamed sr to sr_trim
1058
+ audio_dict_to_export = {"array": arr, "sampling_rate": sr_trim}
1059
  except Exception as e_read_trim:
1060
+ print(f"Warning: Could not read trimmed audio file {potential_trimmed_path} for sample {absolute_idx}: {e_read_trim}.")
1061
+ # else: # Keep original audio_dict_to_export
 
1062
 
1063
  exported_data_list.append({
1064
  "audio": audio_dict_to_export,
1065
  "sentence": sanitize_sentence(sentence_val)
1066
  })
 
1067
  if (i + 1) % 100 == 0:
1068
  progress((i + 1) / iteration_limit, f"Processed {i+1}/{iteration_limit} samples")
1069
  gc.collect()
1070
 
1071
+ if not exported_data_list: return "No data to export after processing."
 
 
 
 
1072
 
1073
+ for item in exported_data_list: # Ensure audio format before creating Dataset
1074
+ audio_item = item["audio"]
1075
+ if audio_item is None or (isinstance(audio_item, dict) and audio_item.get('path') is None and audio_item.get('array') is None):
1076
+ item["audio"] = {"array": np.array([], dtype=np.float32), "sampling_rate": 16000} # Placeholder for missing/deleted
 
1077
 
1078
  try:
1079
  final_dataset = Dataset.from_list(exported_data_list)
1080
+ # Cast audio, ensure all items have 'array' and 'sampling_rate' or valid 'path'
1081
  final_dataset = final_dataset.cast_column("audio", Audio(sampling_rate=16000))
1082
  except Exception as e_cast:
1083
  print(f"Error during Dataset.from_list or cast_column: {e_cast}")
1084
  for idx_problem, problematic_item in enumerate(exported_data_list[:5]):
1085
+ print(f"Sample item {idx_problem} for export: Audio type {type(problematic_item['audio'])}, Content: {str(problematic_item['audio'])[:200]}")
1086
+ return f"Export failed during data conversion: {e_cast}."
1087
 
1088
  dataset_dict_export = DatasetDict({"train": final_dataset})
 
1089
  progress(0.95, "Uploading to Hugging Face...")
 
 
 
1090
 
1091
+ try:
1092
+ current_hf_user = whoami(token=hf_token_for_export)['name']
1093
+ except Exception as e_whoami_export:
1094
+ return f"Export failed: Could not verify Hugging Face user with provided token: {e_whoami_export}"
1095
+
1096
+ dataset_name_part = repo_name_str.split('/')[-1] # Get 'my-annotated-dataset' from 'user/my-annotated-dataset'
1097
+ target_repo_id = f"{current_hf_user}/{dataset_name_part}"
1098
+
1099
+ push_to_hub_with_retry(dataset_dict=dataset_dict_export, repo_id=target_repo_id, private=True, token_val=hf_token_for_export)
1100
  end_time = time.time()
1101
  print(f"Upload done, total time: {end_time - start_time:.2f}s")
1102
  progress(1.0, "Upload complete!")
1103
  return f"Exported to huggingface.co/datasets/{target_repo_id}"
 
1104
  except Exception as e:
1105
  import traceback
1106
  error_msg = f"Export failed: {str(e)}"
1107
  print(f"{error_msg}\n{traceback.format_exc()}")
1108
  return error_msg
1109
 
1110
+ def hf_login(hf_token_val_ui):
1111
+ global CURRENT_USERNAME, token, current_page_data, total_samples, annotator_ranges, SECOND_PHASE_REVIEW_MAPPING, annotation_count
1112
 
1113
+ # Reset session-specific annotation count on new login
1114
+ annotation_count = 0
1115
+
1116
+ # Default state for UI elements on login failure or before successful load
1117
  failed_login_transcript_update = gr.update(value="", interactive=False)
 
 
 
 
1118
 
 
 
 
 
 
 
 
 
 
1119
  def _failed_login_outputs(login_msg_text, reviewer_text_val="N/A"):
1120
+ # This function constructs the 19-tuple for login outputs
1121
  return (
1122
  gr.update(visible=True), gr.update(visible=False), # login_container, main_container
1123
+ gr.update(value=reviewer_text_val), hf_token_val_ui, login_msg_text, # reviewer_tb, hf_token_state, login_message
1124
+ gr.update(visible=False), failed_login_transcript_update, # save_next_button, transcript_tb (interactive)
1125
+ gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), # trim, undo_trim, delete buttons
 
 
 
1126
  gr.update(visible=False, value=False), # first_phase_accept_cb (vis & val)
1127
+ gr.update(visible=False), gr.update(visible=False), # approve_button, reject_button
1128
+ 0, 0, None, failed_login_transcript_update, # page_idx, idx_on_page, audio, transcript_tb (value)
 
 
1129
  login_msg_text if "failed" in login_msg_text.lower() or "error" in login_msg_text.lower() else "Please log in.", # status_md
1130
  "" # original_transcript_state
1131
  )
1132
 
1133
+ if not hf_token_val_ui:
1134
  return _failed_login_outputs("Login failed: Token cannot be empty.")
1135
 
1136
  try:
1137
+ print(f"Attempting login with token from UI...")
1138
+ user_info = whoami(token=hf_token_val_ui)
1139
  username = user_info['name']
1140
+ print(f"whoami successful for user: {username}")
1141
 
1142
  if username in ALLOWED_USERS:
1143
  CURRENT_USERNAME = username
1144
+ token = hf_token_val_ui # IMPORTANT: Set the global token to the one provided in UI
1145
+ print(f"User '{CURRENT_USERNAME}' is in ALLOWED_USERS. Global token updated.")
1146
 
1147
+ # Crucial: Fetch dataset info and ranges AFTER successful login & token set
1148
+ # Reset total_samples to ensure it's re-fetched with the new token if necessary
1149
+ total_samples = 0
1150
+ ds_info = get_dataset_info()
1151
  if total_samples <= 0:
1152
+ return _failed_login_outputs(f"Login OK for {CURRENT_USERNAME}, but failed to get dataset size. Cannot proceed.", reviewer_text_val="Error: No Dataset Size")
1153
 
 
1154
  annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
1155
  if SECOND_PHASE:
1156
+ # SECOND_PHASE_REVIEW_MAPPING.clear() # Clear previous mapping
1157
+ initialize_second_phase_assignments() # This uses global annotator_ranges
 
1158
 
1159
  user_allowed_range_check = get_user_allowed_range(CURRENT_USERNAME)
1160
  if not user_allowed_range_check or user_allowed_range_check[0] > user_allowed_range_check[1]:
1161
+ return _failed_login_outputs(f"Login OK for {CURRENT_USERNAME}, but no samples assigned for {'review' if SECOND_PHASE else 'annotation'}.", reviewer_text_val="No Samples Assigned")
1162
 
1163
+ current_page_data = load_page_data(0) # page_num_within_user_view = 0
 
1164
 
1165
+ # Check if page loading actually got data
1166
+ initial_idx_on_page = 0
1167
+ if current_page_data is None or current_page_data.empty:
1168
+ print(f"Warning: Initial page load for user {CURRENT_USERNAME} resulted in no data.")
1169
+ # Attempt to load interface with (0,0) but expect "no data" messages from get_sample
1170
+ initial_idx_on_page = 0 # or handle as error if no data at all is critical
1171
 
1172
+ # load_interface_data returns a 9-tuple
1173
+ initial_load_tuple = load_interface_data(current_page, initial_idx_on_page)
1174
+
1175
+ is_second_phase_active = SECOND_PHASE
 
1176
 
1177
  # Structure for login_outputs (19 items)
1178
  return (
1179
+ gr.update(visible=False), # 0 login_container
1180
+ gr.update(visible=True), # 1 main_container
1181
+ initial_load_tuple[4], # 2 reviewer_tb (gr.update obj from load_interface_data)
1182
+ hf_token_val_ui, # 3 hf_token_state (value) -> updates the gr.State
1183
+ f"Login successful! Welcome {CURRENT_USERNAME}. Phase: {'Review' if is_second_phase_active else 'Annotation'}.", # 4 login_message
1184
+ gr.update(visible=not is_second_phase_active), # 5 save_next_button (visibility)
1185
+ initial_load_tuple[3], # 6 transcript_tb (gr.update obj for value and interactivity)
1186
+ gr.update(visible=not is_second_phase_active), # 7 trim_button (visibility)
1187
+ gr.update(visible=not is_second_phase_active), # 8 undo_trim_button (visibility)
1188
+ gr.update(visible=not is_second_phase_active), # 9 delete_button (visibility)
1189
+ gr.update(visible=initial_load_tuple[7]['visible'], value=initial_load_tuple[8]), # 10 first_phase_accept_cb (vis from [7], val from [8])
1190
+ gr.update(visible=is_second_phase_active), # 11 approve_button (visibility)
1191
+ gr.update(visible=is_second_phase_active), # 12 reject_button (visibility)
1192
+ initial_load_tuple[0], # 13 current_page_idx_state (value)
1193
+ initial_load_tuple[1], # 14 current_idx_on_page_state (value)
1194
+ initial_load_tuple[2], # 15 audio_player (value or gr.update obj)
1195
+ initial_load_tuple[3], # 16 transcript_tb (can be same as 6, Gradio handles it)
1196
+ initial_load_tuple[5], # 17 status_md (value)
1197
+ initial_load_tuple[6] # 18 original_transcript_state (value)
 
 
 
 
1198
  )
1199
+ else:
1200
  CURRENT_USERNAME = None
1201
+ token = None # Clear global token if auth fails or user not allowed
1202
+ return _failed_login_outputs(f"User '{username}' not in allowed user list.", reviewer_text_val="Unauthorized")
1203
  except Exception as e:
1204
  CURRENT_USERNAME = None
1205
+ token = None # Clear global token on any login exception
1206
  import traceback
1207
  login_err_msg = f"Login failed: {str(e)}"
1208
  print(f"{login_err_msg}\n{traceback.format_exc()}")
1209
  return _failed_login_outputs(login_err_msg, reviewer_text_val="Login Error")
1210
 
1211
 
1212
+ # Gradio Interface (largely same as your previous version)
1213
  css = """
1214
  .white { background-color: white; color: black; } .yellow { background-color: yellow; color: black; }
1215
  .blue { background-color: lightblue; color: black; } .green { background-color: lightgreen; color: black; }
 
1219
  .reviewer-textbox input { text-align: center; font-weight: bold; }
1220
  """
1221
  with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1222
+ # hf_token_state will store the token provided via UI and used for operations.
1223
+ # Initialize with env var 'token' if available, otherwise empty.
1224
+ # This gr.State is updated by the hf_login function's output.
1225
+ hf_token_state = gr.State(os.getenv("hf_token") or "")
1226
+
1227
  current_page_idx_state = gr.State(0)
1228
  current_idx_on_page_state = gr.State(0)
1229
  original_transcript_state = gr.State("")
1230
 
1231
  with gr.Column(visible=True, elem_id="login_container") as login_container:
1232
  gr.Markdown("## HF Authentication")
1233
+ # hf_token_input default value is also from env var, or empty.
1234
+ hf_token_input = gr.Textbox(label="Hugging Face Token", type="password", value=os.getenv("hf_token") or "")
1235
  login_button = gr.Button("Login")
1236
  login_message = gr.Markdown("")
1237
 
 
1269
  jump_text_tb = gr.Textbox(label="Jump to Global Index", placeholder="Enter dataset absolute index")
1270
  jump_button = gr.Button("Jump")
1271
  with gr.Row():
1272
+ # Default repo name will be updated more accurately if user logs in.
1273
+ # For now, a generic placeholder.
1274
+ hf_repo_name_tb = gr.Textbox(label="Export Repository Name (your_hf_username/dataset-name)", value="your-hf-username/my-annotated-asr-dataset")
1275
  hf_export_button = gr.Button("Export to Hugging Face", variant="primary")
1276
  hf_export_status_md = gr.Markdown("")
1277
 
1278
  # Outputs for login_button (19 outputs)
1279
  login_outputs = [
1280
+ login_container, main_container, reviewer_tb, hf_token_state, login_message, # 0-4
1281
+ save_next_button, transcript_tb, trim_button, undo_trim_button, delete_button, # 5-9
1282
+ first_phase_accept_cb, # 10 (this receives a gr.update obj with 'visible' and 'value' keys)
1283
+ approve_button, reject_button, # 11-12
1284
+ current_page_idx_state, current_idx_on_page_state, audio_player, # 13-15
1285
+ transcript_tb, # 16 (target for transcript value, can be same as #6)
1286
+ status_md, original_transcript_state # 17-18
1287
  ]
1288
  login_button.click(fn=hf_login, inputs=[hf_token_input], outputs=login_outputs)
1289
 
1290
+
1291
+ # Common outputs for navigation and actions that reload sample view (9 outputs from load_interface_data)
1292
+ # (page_idx_state, idx_on_page_state, audio_player, transcript_tb_update, reviewer_tb_update,
1293
+ # status_md, original_transcript_state, first_phase_accept_cb_vis_update, first_phase_accept_cb_val)
1294
  navigation_outputs_extended = [
1295
+ current_page_idx_state, current_idx_on_page_state, # States
1296
+ audio_player, transcript_tb, reviewer_tb, status_md, original_transcript_state, # UI components
1297
+ first_phase_accept_cb, # For visibility update (receives gr.update(visible=...))
1298
+ first_phase_accept_cb # For value update (receives value directly, Gradio checkbox handles it)
1299
  ]
1300
 
1301
  save_next_button.click(
 
1313
  inputs=[current_page_idx_state, current_idx_on_page_state],
1314
  outputs=navigation_outputs_extended
1315
  )
 
1316
  approve_button.click(
1317
  fn=review_and_next_sample_second_phase,
1318
  inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("approved")],
 
1323
  inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("rejected")],
1324
  outputs=navigation_outputs_extended
1325
  )
 
1326
  trim_button.click(
1327
  fn=trim_audio_action,
1328
  inputs=[current_page_idx_state, current_idx_on_page_state, trim_start_tb, trim_end_tb],
 
1336
  delete_button.click(
1337
  fn=confirm_delete_audio_action,
1338
  inputs=[current_page_idx_state, current_idx_on_page_state],
1339
+ outputs=navigation_outputs_extended
1340
  )
 
1341
  jump_button.click(
1342
  fn=jump_to_absolute_idx,
1343
  inputs=[jump_text_tb, current_page_idx_state, current_idx_on_page_state],
 
1345
  )
1346
  hf_export_button.click(
1347
  fn=export_to_huggingface,
1348
+ inputs=[hf_repo_name_tb, hf_token_state], # Use hf_token_state here
1349
  outputs=[hf_export_status_md],
1350
  queue=True
1351
  )
1352
 
1353
  if __name__ == "__main__":
1354
+ # Initializations that don't depend on login token can be here
1355
+ # For example, setting SECOND_PHASE based on an env var or config file.
1356
+ # However, total_samples and annotator_ranges should primarily be determined *after* login,
1357
+ # as they might depend on the dataset accessible by the user's token.
 
 
 
 
 
 
 
1358
 
1359
+ # Example: Override SECOND_PHASE for testing
1360
+ # os.environ['APP_SECOND_PHASE'] = "True"
1361
+ # SECOND_PHASE = os.getenv('APP_SECOND_PHASE', 'False').lower() == 'true'
1362
+
1363
+ print(f"Application starting. Second phase mode: {SECOND_PHASE}")
1364
+
1365
+ # Initial dataset info try (might fail if token needed and not globally set from env)
1366
+ # This is mostly for informational purposes before login, hf_login will do a more robust fetch.
1367
+ if total_samples <= 0:
1368
+ print("Main block: total_samples not yet set. Will be determined after login.")
1369
+
1370
  if SECOND_PHASE:
1371
  print("==== APPLICATION LAUNCHING IN SECOND PHASE (REVIEW MODE) ====")
1372
+ # Initialization of SECOND_PHASE_REVIEW_MAPPING will happen after login,
1373
+ # once total_samples and annotator_ranges are confirmed.
 
 
 
 
 
1374
  else:
1375
  print("==== APPLICATION LAUNCHING IN FIRST PHASE (ANNOTATION MODE) ====")
1376