navidved commited on
Commit
e5d52ac
·
verified ·
1 Parent(s): 79b15b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -1262
app.py CHANGED
@@ -1,1244 +1,3 @@
1
- import gradio as gr
2
- import os
3
- import json
4
- import pandas as pd
5
- from datasets import load_dataset, DatasetDict, Dataset, Audio
6
- from huggingface_hub import HfApi, whoami, login, hf_hub_download
7
- import tempfile
8
- import shutil
9
- import gc
10
- import time
11
- import psutil
12
- from pydub import AudioSegment
13
- import soundfile as sf
14
- from tenacity import retry, stop_after_attempt, wait_exponential
15
- import re
16
- import numpy as np
17
- from pydantic import BaseModel
18
- from typing import Optional, List, Tuple
19
- from datetime import datetime
20
-
# --- Hugging Face authentication ---
# The token is read from the environment; without it, Hub uploads/downloads
# attempted later in this app will fail.
token = os.getenv("hf_token")
if not token:
    print("Warning: hf_token environment variable not set. Hugging Face Hub operations might fail.")
else:
    login(token)

# --- Static configuration ---
HF_DATASET_NAME = "navidved/channelb-raw-data"
AUDIO_DIR = "audio"
SAVE_PATH = "annotations.json"
ALLOWED_USERS = ["vargha", "navidved", "userC"]  # userC added to exercise 2nd phase with >1 annotator
REVIEWERS = ["vargha"]  # first-phase reviewers
ANNOTATORS = [user for user in ALLOWED_USERS if user not in REVIEWERS]  # first-phase annotators
CURRENT_USERNAME = None
PAGE_SIZE = 100  # kept for pagination logic, though review may be sample-by-sample
SAVE_INTERVAL = 10

# --- Second-phase review configuration ---
SECOND_PHASE = False  # set True to activate second-phase review
# Maps reviewer_username -> original_annotator_username when SECOND_PHASE is on,
# e.g. {"navidved": "userC"} means navidved reviews userC's work.
SECOND_PHASE_REVIEW_MAPPING = {}

# --- Mutable global state ---
current_page = 0
ds_iter = None
current_page_data = None
audio_backup = {}
annotation_count = 0
unsaved_changes = {}  # primarily for the first phase
total_samples = 0
annotator_ranges = {}  # {annotator_username: (start_idx, end_idx)} for the first phase
# Pydantic data models
class AudioTrim(BaseModel):
    """Inclusive start/end offsets (seconds) of a trimmed audio region."""
    start: float
    end: float
class Annotation(BaseModel):
    """One user's annotation of a sample plus review metadata for both phases."""

    annotator: str  # original annotator (first phase)
    annotated_subtitle: Optional[str] = None
    audio_trims: Optional[List[AudioTrim]] = None

    # First-phase review fields
    is_first_phase_accepted: bool = False
    first_phase_reviewer_username: Optional[str] = None

    # Second-phase review fields
    second_phase_reviewed_by: Optional[str] = None
    second_phase_review_status: Optional[str] = None  # "approved" or "rejected"
    second_phase_review_timestamp: Optional[datetime] = None

    create_at: datetime
    update_at: datetime
class Sample(BaseModel):
    """A dataset row identified by its absolute index, with its annotations."""

    id: int
    voice_name: str
    original_subtitle: str
    ignore_it: bool = False  # True when the audio has been removed/deleted
    description: Optional[str] = None
    annotations: Optional[List[Annotation]] = None
    # True once the primary annotation is approved in the second phase.
    is_approved_in_second_phase: bool = False
class DatasetModel(BaseModel):
    """Root of annotations.json. (Named to avoid clashing with datasets.Dataset.)"""
    samples: Optional[List[Sample]] = None
# Utility functions
def load_saved_annotations():
    """Load the annotation store.

    Resolution order:
      1. the local SAVE_PATH JSON file,
      2. the copy stored in the HF dataset repo (then cache it locally),
      3. a fresh, empty DatasetModel.

    Returns:
        DatasetModel: never None.
    """
    dataset_model = None
    if os.path.exists(SAVE_PATH):
        try:
            with open(SAVE_PATH, "r", encoding="utf-8") as f:
                data = json.load(f)
            dataset_model = DatasetModel(**data)
            print("Loaded annotations from local JSON file")
        except Exception as e:
            # Fix: the old message claimed "Removing invalid file" although the
            # removal is deliberately disabled (auto-removing is risky). The
            # message now matches the actual behavior.
            print(f"Error loading local JSON file: {str(e)}. Keeping invalid file; falling back to other sources.")
            # os.remove(SAVE_PATH) # Be cautious with auto-removing
            dataset_model = None

    if dataset_model is None and token:
        try:
            hf_path = hf_hub_download(
                repo_id=HF_DATASET_NAME,
                filename=SAVE_PATH,
                repo_type="dataset",
                token=token
            )
            with open(hf_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            dataset_model = DatasetModel(**data)
            # Cache the Hub copy locally for the next load.
            with open(SAVE_PATH, "w", encoding="utf-8") as f:
                f.write(dataset_model.model_dump_json(exclude_none=True, indent=4))
            print("Loaded annotations from HF dataset repository and cached locally")
        except Exception as e:
            print(f"Error loading JSON file from HF repo: {str(e)}")
            dataset_model = None

    if dataset_model is None:
        dataset_model = DatasetModel(samples=[])
        print("Created new empty DatasetModel for annotations")

    return dataset_model
def save_annotations(dataset_model: DatasetModel):
    """Persist the annotation store to disk; periodically mirror it to the Hub.

    Every SAVE_INTERVAL-th successful save (and only when a token is
    configured) the JSON file is also pushed to the HF repository.
    """
    global annotation_count
    try:
        serialized = dataset_model.model_dump_json(exclude_none=True, indent=4)
        with open(SAVE_PATH, "w", encoding="utf-8") as f:
            f.write(serialized)
        print(f"Saved annotations to {SAVE_PATH}")
        annotation_count += 1
        if token and annotation_count % SAVE_INTERVAL == 0:
            push_json_to_hf()
    except Exception as e:
        print(f"Error saving annotations: {str(e)}")
def push_json_to_hf():
    """Upload the local annotations JSON to the HF dataset repo.

    No-op (with a message) when no token is configured; upload errors are
    logged rather than raised.
    """
    if not token:
        print("Cannot push to HF: token not available.")
        return
    try:
        HfApi().upload_file(
            path_or_fileobj=SAVE_PATH,
            path_in_repo=SAVE_PATH,
            repo_type="dataset",
            repo_id=HF_DATASET_NAME,
            token=token,
        )
        print("Uploaded annotations.json to Hugging Face repository")
    except Exception as e:
        print(f"Error uploading JSON file: {str(e)}")
def calculate_annotator_ranges(total_samples_val, annotators_list):
    """Split indices 0..total_samples_val-1 into contiguous per-annotator slices.

    The first ``total % n`` annotators each get one extra sample. Returns a
    dict {annotator: (start, end)} with inclusive bounds; annotators whose
    slice would be empty are omitted. Empty dict when there are no annotators
    or no samples.
    """
    count = len(annotators_list)
    if count == 0 or total_samples_val <= 0:
        return {}

    base_share, remainder = divmod(total_samples_val, count)

    ranges = {}
    cursor = 0
    for position, annotator in enumerate(annotators_list):
        span = base_share + (1 if position < remainder else 0)
        # Clamp defensively so the last slice never runs past the dataset.
        last = min(cursor + span - 1, total_samples_val - 1)
        if cursor <= last:
            ranges[annotator] = (cursor, last)
        cursor = last + 1
    return ranges
def initialize_second_phase_assignments():
    """Build SECOND_PHASE_REVIEW_MAPPING (reviewer -> original annotator).

    A single annotator reviews their own work; with several, assignment is
    cyclic (annotator[i] reviews annotator[i-1]'s work). Also makes sure the
    first-phase ``annotator_ranges`` exist so each reviewee has a range.
    """
    global SECOND_PHASE_REVIEW_MAPPING, annotator_ranges
    if not ANNOTATORS or len(ANNOTATORS) < 1:
        print("Not enough annotators for second phase review.")
        SECOND_PHASE_REVIEW_MAPPING = {}
        return

    # Ranges may not have been computed yet (e.g. this runs before first-phase
    # setup); populate them lazily.
    if not annotator_ranges and total_samples > 0:
        print("Populating annotator_ranges for second phase initialization.")
        annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)

    if len(ANNOTATORS) == 1:
        solo = ANNOTATORS[0]
        SECOND_PHASE_REVIEW_MAPPING[solo] = solo
        print(f"Second phase: {solo} will review their own work.")
    else:
        total = len(ANNOTATORS)
        for i, reviewer_user in enumerate(ANNOTATORS):
            original_annotator_user = ANNOTATORS[(i - 1 + total) % total]
            SECOND_PHASE_REVIEW_MAPPING[reviewer_user] = original_annotator_user
            print(f"Second phase: {reviewer_user} will review {original_annotator_user}'s work.")

    # Sanity check: every reviewed annotator should have a first-phase range
    # (missing ranges indicate total_samples was 0 or ranges were miscomputed).
    for reviewer, original_annotator in SECOND_PHASE_REVIEW_MAPPING.items():
        if original_annotator not in annotator_ranges:
            print(f"Warning: Original annotator {original_annotator} has no range defined in annotator_ranges.")
def get_user_allowed_range(username):
    """Return the inclusive (start, end) sample range ``username`` may access.

    Second phase: the first-phase range of the annotator they review, or None
    when the user is not in the review mapping. First phase: reviewers see
    everything, annotators see their assigned slice, everyone else sees None.
    """
    global annotator_ranges, total_samples

    if SECOND_PHASE:
        if not SECOND_PHASE_REVIEW_MAPPING:
            initialize_second_phase_assignments()

        reviewee = SECOND_PHASE_REVIEW_MAPPING.get(username)
        if not reviewee:
            return None  # not a designated second-phase reviewer

        # Lazily compute first-phase ranges when missing.
        if not annotator_ranges and total_samples > 0:
            annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
        return annotator_ranges.get(reviewee)

    # First-phase logic.
    if get_user_role(username) == "reviewer":
        return (0, total_samples - 1) if total_samples > 0 else None
    return annotator_ranges.get(username)
def is_within_range(absolute_idx, allowed_range):
    """True when absolute_idx falls inside the inclusive (start, end) range; False for None."""
    return allowed_range is not None and allowed_range[0] <= absolute_idx <= allowed_range[1]
def get_user_role(username):
    """First-phase role: 'reviewer' when listed in REVIEWERS, otherwise 'annotator'."""
    if username in REVIEWERS:
        return "reviewer"
    return "annotator"
def init_dataset_iterator():
    """Kept for backward compatibility; always returns True.

    The app no longer keeps a global streaming iterator open (each page load
    re-opens the dataset — see load_page_data), so there is nothing left to
    initialize. The old try/except here wrapped only comments and a
    ``return True``, making the except branch unreachable dead code; it has
    been removed.

    Returns:
        bool: always True.
    """
    global ds_iter  # retained so callers relying on the global see it untouched
    return True
def load_page_data(page_num=0):
    """Materialize one page of samples for the current user.

    Re-opens the streaming dataset (streams cannot seek), skips to the page
    window clipped by the user's allowed range, and fills at most PAGE_SIZE
    rows. Updates the globals ``current_page`` and ``current_page_data`` and
    returns the resulting DataFrame (possibly empty).
    """
    global current_page_data, current_page, total_samples

    try:
        stream = iter(load_dataset(HF_DATASET_NAME, split="train", streaming=True))
    except Exception as e:
        print(f"Error loading dataset for page data: {e}")
        current_page_data = pd.DataFrame(columns=["audio", "sentence", "id_within_page"])
        return current_page_data

    # The visible window is defined on absolute dataset indices, not page math
    # alone: it must also respect the user's total allowed range.
    allowed_range = get_user_allowed_range(CURRENT_USERNAME)
    if not allowed_range:
        print(f"User {CURRENT_USERNAME} has no allowed range.")
        current_page_data = pd.DataFrame(columns=["audio", "sentence", "id_within_page"])
        return current_page_data

    page_start_abs_idx = page_num * PAGE_SIZE
    page_end_abs_idx = page_start_abs_idx + PAGE_SIZE - 1
    effective_start_idx = max(page_start_abs_idx, allowed_range[0])
    effective_end_idx = min(page_end_abs_idx, allowed_range[1])

    # NOTE: linear scan through the stream — slow for large page numbers; a
    # seekable (non-streaming) dataset would be better at scale.
    samples_on_page = []
    relative_idx = 0
    for absolute_idx, record in enumerate(stream):
        if absolute_idx > effective_end_idx:
            break  # past everything needed for this page and user range
        if absolute_idx >= effective_start_idx:
            record['absolute_idx'] = absolute_idx
            record['id_within_page'] = relative_idx  # position within this page view
            samples_on_page.append(record)
            relative_idx += 1
            if len(samples_on_page) >= PAGE_SIZE:
                break  # page is full

    current_page = page_num
    if samples_on_page:
        current_page_data = pd.DataFrame(samples_on_page)
    else:
        current_page_data = pd.DataFrame(columns=["audio", "sentence", "id_within_page", "absolute_idx"])
        print(f"No samples found for user {CURRENT_USERNAME} on page {page_num} within effective range {effective_start_idx}-{effective_end_idx}")

    gc.collect()
    return current_page_data
def get_dataset_info():
    """Return {'num_samples': N} for the configured dataset.

    Caches the count in the module-level ``total_samples`` on first success.
    Falls back to {'num_samples': -1} when the count cannot be determined,
    which breaks downstream range logic — hence the warning.

    Fix: the previous version first opened the dataset in streaming mode just
    to read ``.info`` whose value only fed a no-op ``hasattr`` check, then
    loaded the full dataset anyway; the wasted streaming probe is removed.
    """
    global total_samples
    if total_samples > 0:  # already fetched
        return {'num_samples': total_samples}
    try:
        # Load non-streaming to get an authoritative row count; a streaming
        # split's num_examples is often -1/None and cannot be trusted.
        ds_info_obj = load_dataset(HF_DATASET_NAME, split="train")
        num_samples_val = ds_info_obj.num_rows
        if num_samples_val and num_samples_val > 0:
            total_samples = num_samples_val
            return {'num_samples': total_samples}

        print("Warning: Could not reliably determine total_samples from dataset info.")
        return {'num_samples': -1}
    except Exception as e:
        print(f"Error getting dataset info: {e}")
        return {'num_samples': -1}
-
# --- Initial data load ---
# (No global streaming iterator is kept; pages re-open the dataset on demand.)
dataset_info = get_dataset_info()  # sets the global total_samples
if total_samples > 0:
    annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
    if SECOND_PHASE:
        # Must run after annotator_ranges is available.
        initialize_second_phase_assignments()
else:
    print("Warning: total_samples is not positive. Annotation ranges and second phase assignments may be incorrect.")
    annotator_ranges = {}

# NOTE: the first page is loaded only after login, once CURRENT_USERNAME is
# known (see the hf_login success path), not here.
def get_audio_path(audio_entry):
    """Normalize a dataset audio entry into something the audio player can use.

    Returns one of:
      * a (sampling_rate, array) tuple for decoded in-memory audio,
      * a URL or filesystem path string,
      * None for unrecognized entry types.
    """
    if isinstance(audio_entry, dict):
        if "array" in audio_entry and "sampling_rate" in audio_entry:
            # Decoded audio: hand back (rate, samples) for direct playback.
            return (audio_entry["sampling_rate"], audio_entry["array"])
        return audio_entry.get("path", None)

    if isinstance(audio_entry, str):
        if audio_entry.startswith(("http://", "https://")):
            return audio_entry  # remote URL
        if os.path.exists(audio_entry):
            return audio_entry  # absolute/existing path
        # Try resolving relative to the configured audio directory.
        if AUDIO_DIR:
            candidate = os.path.join(AUDIO_DIR, audio_entry)
            if os.path.exists(candidate):
                return candidate
        # Keep as-is: may still be a relative path resolvable by `datasets`.
        return audio_entry

    return None
# Core functions
def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_action, accepted_flag=False):
    """Persist a first-phase transcript edit (and optional reviewer acceptance).

    Looks up the sample shown at ``idx_on_page`` of the current page, enforces
    the acting user's assigned range (first phase only), then creates or
    updates that user's Annotation in annotations.json.

    Returns a human-readable status string.
    """
    global current_page_data, unsaved_changes

    if current_page_data is None or idx_on_page >= len(current_page_data):
        return "Invalid index or data not loaded for current page."

    row = current_page_data.iloc[idx_on_page]
    absolute_idx = row['absolute_idx']

    # Range enforcement applies only in the first phase; second-phase access
    # is already constrained by how pages are loaded.
    allowed_range = get_user_allowed_range(current_user_performing_action)
    if not is_within_range(absolute_idx, allowed_range) and not SECOND_PHASE:
        return "You are not allowed to annotate this sample (out of range)."

    audio_entry = row["audio"]  # may be a path string or a decoded dict
    voice_name = os.path.basename(str(get_audio_path(audio_entry) or f"sample_{absolute_idx}"))

    dataset_model = load_saved_annotations()
    sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)

    if not sample:
        sample = Sample(
            id=absolute_idx,
            voice_name=voice_name,
            original_subtitle=row["sentence"],
            annotations=[]
        )
        dataset_model.samples = dataset_model.samples or []
        dataset_model.samples.append(sample)

    now = datetime.now()
    # 'accepted_flag' only matters when the acting user is a first-phase reviewer.
    annotation = next((a for a in sample.annotations or [] if a.annotator == current_user_performing_action), None)

    if annotation:
        annotation.annotated_subtitle = transcript.strip()
        annotation.update_at = now
        if get_user_role(current_user_performing_action) == "reviewer":
            annotation.is_first_phase_accepted = accepted_flag
            annotation.first_phase_reviewer_username = current_user_performing_action if accepted_flag else None
    else:
        payload = {
            "annotator": current_user_performing_action,
            "annotated_subtitle": transcript.strip(),
            "create_at": now,
            "update_at": now,
            "is_first_phase_accepted": False,  # default
        }
        if get_user_role(current_user_performing_action) == "reviewer":
            payload["is_first_phase_accepted"] = accepted_flag
            if accepted_flag:
                payload["first_phase_reviewer_username"] = current_user_performing_action

        annotation = Annotation(**payload)
        sample.annotations = sample.annotations or []
        sample.annotations.append(annotation)

    # The edit is now persisted, so drop any pending in-memory change marker.
    if absolute_idx in unsaved_changes:
        del unsaved_changes[absolute_idx]

    save_annotations(dataset_model)
    return f"✓ Saved annotation for sample {absolute_idx}"
def handle_second_phase_action(page_idx, idx_on_page, action: str):
    """Record a second-phase review verdict; ``action`` is "approved" or "rejected".

    The current user must be mapped (via SECOND_PHASE_REVIEW_MAPPING) to an
    original annotator; that annotator's annotation for the displayed sample
    is stamped with the verdict. If the original annotator never submitted,
    a stand-in annotation based on the original subtitle is created first.

    Returns a human-readable status string.
    """
    global current_page_data, CURRENT_USERNAME

    if not SECOND_PHASE:
        return "Not in second phase."
    if current_page_data is None or idx_on_page >= len(current_page_data):
        return "Invalid index or data not loaded for current page (second phase)."

    row = current_page_data.iloc[idx_on_page]
    absolute_idx = row['absolute_idx']

    original_annotator_to_review = SECOND_PHASE_REVIEW_MAPPING.get(CURRENT_USERNAME)
    if not original_annotator_to_review:
        return "You are not assigned to review any user's work."

    dataset_model = load_saved_annotations()
    sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
    if not sample:
        # The sample exists in the source dataset but never made it into
        # annotations.json — a second-phase reviewer needs something to review.
        return f"Error: Sample {absolute_idx} not found in annotations.json for review."

    # Find the annotation produced by the annotator under review.
    annotation_to_review = next(
        (ann for ann in sample.annotations or [] if ann.annotator == original_annotator_to_review),
        None,
    )

    if not annotation_to_review:
        # The first-phase annotator skipped this item: materialize a stand-in
        # annotation from the original subtitle so the verdict has a target.
        print(f"Warning: No prior annotation by {original_annotator_to_review} for sample {absolute_idx}. Reviewing original subtitle implicitly.")
        annotation_to_review = Annotation(
            annotator=original_annotator_to_review,
            annotated_subtitle=sample.original_subtitle,
            # Approximate the original creation time when possible.
            create_at=sample.annotations[0].create_at if sample.annotations else datetime.now(),
            update_at=datetime.now()
        )
        sample.annotations = sample.annotations or []
        sample.annotations.append(annotation_to_review)

    annotation_to_review.second_phase_reviewed_by = CURRENT_USERNAME
    annotation_to_review.second_phase_review_status = action
    annotation_to_review.second_phase_review_timestamp = datetime.now()
    annotation_to_review.update_at = datetime.now()

    # Only an explicit approval flips the sample-level flag; a rejection
    # leaves it untouched.
    if action == "approved":
        sample.is_approved_in_second_phase = True

    save_annotations(dataset_model)
    return f"✓ Review ({action}) saved for sample {absolute_idx} (Original annotator: {original_annotator_to_review})"
def get_sample(page_idx, idx_on_page, current_user_displaying):
    """Resolve everything the UI needs to render one sample.

    Returns an 8-tuple:
        (audio value, transcript text, status message, reviewer-field text,
         color class, transcript editable?, accepted flag, original transcript)
    """
    global current_page_data, unsaved_changes, total_samples

    if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
        return None, "", f"Invalid index. Range is 0-{len(current_page_data)-1}", "unreviewed", "white", True, False, ""

    row = current_page_data.iloc[idx_on_page]
    absolute_idx = row['absolute_idx']

    audio_val = get_audio_path(row["audio"])
    default_transcript = row["sentence"]
    transcript_to_display = default_transcript

    # Default UI state: fresh, unreviewed, editable.
    ui_reviewer_field = "unreviewed"  # textbox showing who annotated/reviewed
    ui_color = "white"
    ui_editable = True
    ui_is_accepted_flag = False  # first-phase checkmark / second-phase approval display
    ui_status_message = f"Sample {absolute_idx+1}"
    if total_samples > 0:
        ui_status_message += f" of {total_samples}"

    dataset_model = load_saved_annotations()
    sample_from_json = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)

    if sample_from_json:
        if sample_from_json.ignore_it:
            # The sample was explicitly removed.
            audio_val = None
            transcript_to_display = "AUDIO DELETED (This audio has been removed.)"
            ui_reviewer_field = "deleted"
            ui_color = "red"
            ui_editable = False

        elif SECOND_PHASE:
            ui_editable = False  # transcripts are read-only during review
            original_annotator_being_reviewed = SECOND_PHASE_REVIEW_MAPPING.get(current_user_displaying)

            if not original_annotator_being_reviewed:
                # Should be unreachable when the UI gates access properly.
                transcript_to_display = "Error: User not in review mapping."
                ui_color = "red"
            else:
                ui_reviewer_field = f"Reviewing: {original_annotator_being_reviewed}"
                annotation_under_review = next(
                    (ann for ann in sample_from_json.annotations or [] if ann.annotator == original_annotator_being_reviewed),
                    None,
                )

                if annotation_under_review:
                    transcript_to_display = annotation_under_review.annotated_subtitle or default_transcript
                    ui_is_accepted_flag = (annotation_under_review.second_phase_review_status == "approved")

                    if annotation_under_review.second_phase_reviewed_by:
                        if annotation_under_review.second_phase_reviewed_by == current_user_displaying:
                            # green = approved by me, orange = rejected by me
                            ui_color = "green" if annotation_under_review.second_phase_review_status == "approved" else "orange"
                        else:
                            # Someone else already reviewed it.
                            ui_color = "gray"
                            ui_reviewer_field += f" (Reviewed by {annotation_under_review.second_phase_reviewed_by})"
                    else:
                        ui_color = "yellow"  # awaiting the current user's review
                else:
                    # Original annotator never submitted; review the dataset text.
                    transcript_to_display = default_transcript
                    ui_reviewer_field += " (Original annotator made no submission)"
                    ui_color = "lightgray"

        else:
            # ---- First-phase logic ----
            # An annotation accepted by a first-phase reviewer wins outright.
            accepted_first_phase_annotation = next(
                (a for a in sample_from_json.annotations or [] if a.is_first_phase_accepted and a.first_phase_reviewer_username),
                None,
            )

            if accepted_first_phase_annotation:
                transcript_to_display = accepted_first_phase_annotation.annotated_subtitle or default_transcript
                ui_reviewer_field = accepted_first_phase_annotation.first_phase_reviewer_username
                ui_color = "green"
                ui_is_accepted_flag = True
                # Once accepted, only a first-phase reviewer may still edit.
                ui_editable = (get_user_role(current_user_displaying) == "reviewer")
            else:
                # Next priority: the current user's own annotation.
                user_specific_annotation = next(
                    (a for a in sample_from_json.annotations or [] if a.annotator == current_user_displaying),
                    None,
                )
                if user_specific_annotation:
                    transcript_to_display = user_specific_annotation.annotated_subtitle or default_transcript
                    ui_reviewer_field = user_specific_annotation.annotator
                    # pink marks a pending unsaved edit over saved work
                    ui_color = "yellow" if absolute_idx not in unsaved_changes else "pink"
                    ui_editable = True
                else:
                    # Unaccepted work by other annotators.
                    other_annotations = [
                        a for a in sample_from_json.annotations or []
                        if a.annotator != current_user_displaying and not a.is_first_phase_accepted
                    ]
                    if other_annotations:
                        if get_user_role(current_user_displaying) == "reviewer":
                            # Reviewers see (and may act on) the first other submission.
                            other_ann_to_show = other_annotations[0]
                            transcript_to_display = other_ann_to_show.annotated_subtitle or default_transcript
                            ui_reviewer_field = other_ann_to_show.annotator
                            ui_color = "blue"
                            ui_editable = True
                        else:
                            # Annotators cannot edit a peer's unreviewed work.
                            transcript_to_display = default_transcript
                            ui_reviewer_field = "labeled by another annotator"
                            ui_color = "lightblue"
                            ui_editable = False
                    elif absolute_idx in unsaved_changes:
                        # Unsaved local edit by the current user.
                        transcript_to_display = unsaved_changes[absolute_idx]
                        ui_reviewer_field = current_user_displaying
                        ui_color = "pink"
                        ui_editable = True
                    # else: keep the fresh/unreviewed defaults set above.

    # No sample_from_json: brand-new sample from the dataset; defaults apply.

    ui_status_message = f"{ui_status_message} - Page {page_idx + 1}"
    ui_status_message += " (Review Phase)" if SECOND_PHASE else " (Annotation Phase)"

    return audio_val, transcript_to_display, ui_status_message, ui_reviewer_field, ui_color, ui_editable, ui_is_accepted_flag, default_transcript
def load_interface_data(page_idx, idx_on_page):
    """Fetch one sample via get_sample() and shape it for the Gradio outputs.

    Returns (page state, index state, audio value, transcript update,
    reviewer-box update, status text, original transcript) matching the UI
    wiring. (Named load_interface_data to avoid clashing with load_interface.)
    """
    audio, text, base_status, saved_reviewer_text, color, editable, accepted_flag, original_dataset_text = get_sample(
        page_idx, idx_on_page, CURRENT_USERNAME
    )

    return (
        page_idx,                                                     # current_page_idx state
        idx_on_page,                                                  # current_idx_on_page state
        audio,                                                        # audio player value
        gr.update(value=text, interactive=editable),                  # transcript textbox
        gr.update(value=saved_reviewer_text, elem_classes=[color]),   # reviewer textbox
        base_status,                                                  # status markdown
        original_dataset_text,                                        # original_transcript state
    )
# Navigation functions
def navigate_sample(page_idx, idx_on_page, direction: int):
    """Move one sample forward (direction=1) or back (direction=-1),
    crossing page boundaries when needed.

    Returns the 7-tuple from load_interface_data(), or — at a boundary — the
    unchanged indices plus gr.update() placeholders and a status message.
    """
    global current_page_data, total_samples

    if current_page_data is None or len(current_page_data) == 0:
        return page_idx, idx_on_page, gr.update(), gr.update(), gr.update(), "No data loaded.", gr.update()

    target = idx_on_page + direction
    new_page_idx = page_idx
    new_idx_on_page = target

    if target < 0:
        # Step back into the previous page, landing on its last item.
        if page_idx <= 0:
            return page_idx, idx_on_page, gr.update(), gr.update(), gr.update(), "At the beginning of your assigned samples.", gr.update()
        new_page_idx = page_idx - 1
        prev_data = load_page_data(new_page_idx)
        if prev_data is None or prev_data.empty:
            # Previous page is empty or outside the allowed range.
            return page_idx, idx_on_page, gr.update(), gr.update(), gr.update(), "No more samples in this direction (prev page).", gr.update()
        new_idx_on_page = len(prev_data) - 1

    elif target >= len(current_page_data):
        # Advance into the next page (load_page_data refreshes current_page_data).
        new_page_idx = page_idx + 1
        next_data = load_page_data(new_page_idx)
        if next_data is not None and not next_data.empty:
            new_idx_on_page = 0
        else:
            # Distinguish "end of assignment" from an empty/out-of-range page.
            allowed_range = get_user_allowed_range(CURRENT_USERNAME)
            current_abs_idx = current_page_data.iloc[idx_on_page]['absolute_idx']
            if allowed_range and current_abs_idx >= allowed_range[1]:
                return page_idx, idx_on_page, gr.update(), gr.update(), gr.update(), "At the end of your assigned samples.", gr.update()
            return page_idx, idx_on_page, gr.update(), gr.update(), gr.update(), "No more samples in this direction (next page).", gr.update()

    # Page data is already refreshed if we crossed a boundary.
    return load_interface_data(new_page_idx, new_idx_on_page)
702
def go_next_sample_wrapper(page_idx, idx_on_page): # Simpler wrapper for UI
    """UI callback: step forward one sample (does not save pending edits)."""
    step_forward = 1
    return navigate_sample(page_idx, idx_on_page, step_forward)
704
-
705
def go_prev_sample_wrapper(page_idx, idx_on_page): # Simpler wrapper for UI
    """UI callback: step back one sample (does not save pending edits)."""
    step_back = -1
    return navigate_sample(page_idx, idx_on_page, step_back)
707
-
708
-
709
def save_and_next_sample_first_phase(page_idx, idx_on_page, current_text, is_accepted_by_reviewer_flag):
    """Persist the current transcript for CURRENT_USERNAME, then advance one sample.

    The acceptance checkbox only counts when the logged-in user is a first-phase
    reviewer; for plain annotators it is forced to False before saving.
    """
    acting_as_reviewer = get_user_role(CURRENT_USERNAME) == "reviewer"
    accepted = is_accepted_by_reviewer_flag if acting_as_reviewer else False
    save_msg = save_sample_data(page_idx, idx_on_page, current_text, CURRENT_USERNAME,
                                accepted_flag=accepted)
    print(save_msg) # Log save message
    return navigate_sample(page_idx, idx_on_page, 1)
722
-
723
-
724
def review_and_next_sample_second_phase(page_idx, idx_on_page, review_action: str):
    """Record a second-phase approve/reject decision, then move to the next sample."""
    print(handle_second_phase_action(page_idx, idx_on_page, review_action)) # Log feedback
    return navigate_sample(page_idx, idx_on_page, 1)
729
-
730
-
731
def jump_to_absolute_idx(target_abs_idx_str, current_page_idx, current_idx_on_page): # Removed unused text/annotator params
    """Jump directly to an absolute sample index entered by the user.

    Validates the index against the user's assigned range, loads the page that
    contains it, and returns the same 7-tuple as load_interface_data. On any
    error the current view is re-fetched and returned with a status message.
    """
    global current_page_data
    try:
        target_abs_idx = int(target_abs_idx_str)
        if target_abs_idx < 0: target_abs_idx = 0

        allowed_range = get_user_allowed_range(CURRENT_USERNAME)
        if not is_within_range(target_abs_idx, allowed_range):
            status_msg = f"Target index {target_abs_idx} is outside your assigned range {allowed_range}."
            # Return current state with error message
            audio, text, _, rev, color, edit, acc, orig_txt = get_sample(current_page_idx, current_idx_on_page, CURRENT_USERNAME)
            return current_page_idx, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt

        new_page_idx = target_abs_idx // PAGE_SIZE
        new_idx_on_page_conceptual = target_abs_idx % PAGE_SIZE # This is index on the conceptual new page

        # Load data for the new page
        temp_page_data = load_page_data(new_page_idx) # This updates global current_page_data

        if temp_page_data is None or temp_page_data.empty:
            status_msg = f"No data found for page {new_page_idx} containing index {target_abs_idx}."
            audio, text, _, rev, color, edit, acc, orig_txt = get_sample(current_page_idx, current_idx_on_page, CURRENT_USERNAME)
            return current_page_idx, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt

        # Find the actual index on the loaded page for target_abs_idx.
        # The loaded page might not start exactly at new_page_idx * PAGE_SIZE if
        # the user's range is small; load_page_data adds 'absolute_idx' and
        # 'id_within_page' columns to current_page_data for this lookup.
        matching_rows = current_page_data[current_page_data['absolute_idx'] == target_abs_idx]
        if not matching_rows.empty:
            # NOTE(review): .index[0] is the DataFrame label, used downstream as a
            # positional index — verify load_page_data resets the index so they match.
            new_idx_on_page_actual = matching_rows.index[0] # This is the DataFrame index, should be same as 'id_within_page'
        else:
            # target_abs_idx is in the allowed range but absent from the loaded page
            # (e.g. the page is sparse due to filtering); fall back to the page start.
            status_msg = f"Index {target_abs_idx} is in range, but not found on page {new_page_idx}. Displaying start of page."
            print(status_msg) # Log this
            new_idx_on_page_actual = 0 # Default to first item of the loaded page
            if current_page_data.empty : # Page is actually empty
                audio, text, _, rev, color, edit, acc, orig_txt = get_sample(current_page_idx, current_idx_on_page, CURRENT_USERNAME) # Revert to old view
                return current_page_idx, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt

        return load_interface_data(new_page_idx, new_idx_on_page_actual)

    except ValueError:
        status_msg = "Invalid index format for jump."
        audio, text, _, rev, color, edit, acc, orig_txt = get_sample(current_page_idx, current_idx_on_page, CURRENT_USERNAME)
        return current_page_idx, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt
    except Exception as e:
        status_msg = f"Error jumping to index: {e}"
        print(status_msg)
        audio, text, _, rev, color, edit, acc, orig_txt = get_sample(current_page_idx, current_idx_on_page, CURRENT_USERNAME)
        return current_page_idx, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt
785
-
786
-
787
- # Audio editing functions (simplifying, assuming these are for phase 1 only)
788
def trim_audio_action(page_idx, idx_on_page, trim_start_str, trim_end_str):
    """Trim the current sample's audio to [trim_start, trim_end] seconds.

    Phase-1 only (returns early in review phase). Requires the audio to be a
    local file; writes the trimmed clip under trimmed_audio/ and records the
    trim window on CURRENT_USERNAME's annotation. Returns the 7-tuple used by
    the navigation outputs, with the trimmed file as the new audio value.
    """
    if SECOND_PHASE: return page_idx, idx_on_page, gr.Audio(), gr.Textbox(), gr.Textbox(), "Trimming disabled in Review Phase.", gr.Textbox()

    # Simplified: fetch audio path if possible
    audio_val, transcript, base_status, saved_reviewer, color, editable, accepted, _ = get_sample(page_idx, idx_on_page, CURRENT_USERNAME)

    if not isinstance(audio_val, str) or not os.path.exists(audio_val):
        # Try to get original path from current_page_data for non-raw audio
        if current_page_data is not None and idx_on_page < len(current_page_data):
            audio_entry = current_page_data.iloc[idx_on_page]["audio"]
            resolved_path = get_audio_path(audio_entry)
            if isinstance(resolved_path, str) and os.path.exists(resolved_path):
                audio_val = resolved_path
            else: # If it's raw audio data (tuple) or URL, or non-existent path
                return page_idx, idx_on_page, audio_val, transcript, saved_reviewer, "Trimming not supported for this audio format or it's not a local file.", transcript
        else:
            return page_idx, idx_on_page, audio_val, transcript, saved_reviewer, "Audio data not available for trimming.", transcript

    absolute_idx = current_page_data.iloc[idx_on_page]['absolute_idx']
    voice_name_original = os.path.basename(str(current_page_data.iloc[idx_on_page]["audio"]))

    try:
        audio_seg = AudioSegment.from_file(audio_val)
        # pydub slices in milliseconds
        start_ms = int(float(trim_start_str) * 1000)
        end_ms = int(float(trim_end_str) * 1000)
        trimmed_seg = audio_seg[start_ms:end_ms]

        os.makedirs("trimmed_audio", exist_ok=True)
        trimmed_filename = f"trimmed_{absolute_idx}_{voice_name_original}"
        # Ensure unique extension, wav is usually safe
        if not trimmed_filename.lower().endswith(('.wav', '.mp3', '.flac')):
            trimmed_filename += ".wav"
        trimmed_path = os.path.join("trimmed_audio", trimmed_filename)

        # Export format might need to match original or be a standard like wav
        export_format = os.path.splitext(trimmed_path)[1][1:]
        if not export_format: export_format = "wav" # Default if no extension

        trimmed_seg.export(trimmed_path, format=export_format)

        dataset_model = load_saved_annotations()
        sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
        if not sample: # Should exist if we are editing it
            return page_idx, idx_on_page, audio_val, transcript, saved_reviewer, "Error: Sample not found in annotations for trimming.", transcript

        now = datetime.now()
        # Associate trim with current user's annotation for this sample
        annotation = next((a for a in sample.annotations or [] if a.annotator == CURRENT_USERNAME), None)
        if not annotation: # Create if doesn't exist
            annotation = Annotation(
                annotator=CURRENT_USERNAME,
                annotated_subtitle=transcript, # Current transcript
                audio_trims=[AudioTrim(start=float(trim_start_str), end=float(trim_end_str))],
                create_at=now,
                update_at=now
            )
            sample.annotations = sample.annotations or []
            sample.annotations.append(annotation)
        else:
            # Re-trimming replaces any previous trim window
            annotation.audio_trims = [AudioTrim(start=float(trim_start_str), end=float(trim_end_str))]
            annotation.update_at = now

        save_annotations(dataset_model)
        new_status = f"{base_status} [Trimmed]"
        return page_idx, idx_on_page, trimmed_path, transcript, saved_reviewer, new_status, transcript
    except Exception as e:
        return page_idx, idx_on_page, audio_val, transcript, saved_reviewer, f"Error trimming audio: {str(e)}", transcript
860
-
861
-
862
def undo_trim_action(page_idx, idx_on_page):
    """Discard CURRENT_USERNAME's saved trim for the current sample and restore the source audio.

    Phase-1 only (returns early in review phase). Clears the annotation's
    audio_trims (if any) and returns the 7-tuple used by the navigation
    outputs, with the un-trimmed source audio as the new audio value.
    """
    if SECOND_PHASE: return page_idx, idx_on_page, gr.Audio(), gr.Textbox(), gr.Textbox(), "Undo Trim disabled in Review Phase.", gr.Textbox()

    # Only transcript/status/reviewer fields are needed here; the played audio is
    # re-resolved from the source entry below.
    _, transcript, base_status, saved_reviewer, _, _, _, _ = get_sample(page_idx, idx_on_page, CURRENT_USERNAME)
    absolute_idx = current_page_data.iloc[idx_on_page]['absolute_idx']

    dataset_model = load_saved_annotations()
    sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
    if sample:
        # Trims are stored per-user, so only clear this user's annotation.
        annotation = next((a for a in sample.annotations or [] if a.annotator == CURRENT_USERNAME), None)
        if annotation and annotation.audio_trims:
            annotation.audio_trims = None
            annotation.update_at = datetime.now()
            save_annotations(dataset_model)

    # Restore original audio by re-resolving the source dataset entry.
    original_audio_path_or_data = current_page_data.iloc[idx_on_page]["audio"]
    restored_audio_val = get_audio_path(original_audio_path_or_data)

    new_status = f"{base_status} [Trim undone]"
    return page_idx, idx_on_page, restored_audio_val, transcript, saved_reviewer, new_status, transcript
889
-
890
-
891
def confirm_delete_audio_action(page_idx, idx_on_page):
    """Mark the current sample's audio as deleted (ignore_it) in the annotation store.

    Phase-1 only (returns early in review phase). Creates the Sample record if
    needed, sets ignore_it, and records a deletion-marker annotation for
    CURRENT_USERNAME. Returns the navigation 7-tuple with the audio player
    cleared (None) and the deletion marker as the transcript.
    """
    if SECOND_PHASE: return page_idx, idx_on_page, gr.Audio(), gr.Textbox(), gr.Textbox(), "Delete disabled in Review Phase.", gr.Textbox()

    absolute_idx = current_page_data.iloc[idx_on_page]['absolute_idx']
    voice_name_original = os.path.basename(str(current_page_data.iloc[idx_on_page]["audio"]))

    dataset_model = load_saved_annotations()
    sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
    if not sample:
        # First action on this sample: create its record on the fly.
        sample = Sample(
            id=absolute_idx,
            voice_name=voice_name_original,
            original_subtitle=current_page_data.iloc[idx_on_page]["sentence"],
            annotations=[]
        )
        dataset_model.samples = dataset_model.samples or []
        dataset_model.samples.append(sample)

    sample.ignore_it = True
    now = datetime.now()
    # Create/update an annotation by CURRENT_USERNAME to mark this action
    annotation = next((a for a in sample.annotations or [] if a.annotator == CURRENT_USERNAME), None)
    deleted_text_marker = "AUDIO DELETED (This audio has been removed.)"
    if annotation:
        annotation.annotated_subtitle = deleted_text_marker
        annotation.audio_trims = None # Clear trims
        annotation.update_at = now
        # Potentially clear review statuses if deletion overrides them
    else:
        annotation = Annotation(
            annotator=CURRENT_USERNAME,
            annotated_subtitle=deleted_text_marker,
            create_at=now,
            update_at=now
        )
        sample.annotations = sample.annotations or []
        sample.annotations.append(annotation)

    save_annotations(dataset_model)

    new_status = f"Sample {absolute_idx+1} [Audio deleted]"
    if total_samples > 0: new_status += f" of {total_samples}"

    # Return values to update UI correctly after deletion
    return page_idx, idx_on_page, None, deleted_text_marker, "deleted", new_status, deleted_text_marker
936
-
937
-
938
- # Export functions (largely unchanged, ensure CURRENT_USERNAME context if it matters for export)
939
def sanitize_string(s):
    """Replace every character except word chars, '.', '/', and '-' with '_'.

    Used to build safe repo names and file names. The original pattern
    ``[^\\w-./]`` put '-' between two class items, which ``re`` rejects as a
    bad character range at compile time; placing '-' last makes it literal.
    """
    if not isinstance(s, str): s = str(s)
    return re.sub(r'[^\w./-]', '_', s)
942
-
943
def sanitize_sentence(s):
    """Coerce to str and drop any code points that cannot survive a UTF-8 round trip."""
    text = s if isinstance(s, str) else str(s)
    return text.encode("utf-8", errors="ignore").decode("utf-8")
946
-
947
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def push_to_hub_with_retry(dataset_dict, repo_id, private=True, token_val=None):
    """Push a DatasetDict to the Hub; tenacity retries transient failures with backoff."""
    if token_val:
        print(f"Pushing dataset to {repo_id}")
        dataset_dict.push_to_hub(repo_id, private=private, token=token_val)
    else:
        # Without a token the push would fail anyway; bail out quietly.
        print("Cannot push to hub: No token provided for push_to_hub_with_retry.")
954
-
955
def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progress()):
    """Rebuild the source dataset with the 'winning' annotations applied and push it to the Hub.

    Selection priority per sample: ignore_it (deleted) > second-phase approved
    annotation > first-phase accepted annotation > latest annotation by
    update_at. Trimmed audio files are picked up from trimmed_audio/ when
    present. Returns a human-readable success or failure message.
    """
    # NOTE: This export logic needs careful review. It rebuilds a dataset from
    # HF_DATASET_NAME and applies annotations; it should reflect the FINAL state
    # of annotations (e.g., after second-phase review if applicable).
    if not hf_token_for_export:
        return "Export failed: Hugging Face token is missing."
    try:
        start_time = time.time()
        repo_name_str = sanitize_string(repo_name_str)
        print(f"Export started at {time.strftime('%Y-%m-%d %H:%M:%S')}")

        dataset_model_annotations = load_saved_annotations() # Load all annotations

        # total_samples (global) is the definitive count of samples to export.
        if total_samples <= 0:
            return "Export failed: Total number of samples is unknown or invalid."

        # Non-streaming load for easier iteration up to total_samples; this
        # collects all data and can be memory intensive for large datasets.
        ds_source = load_dataset(HF_DATASET_NAME, split="train", streaming=False)

        exported_data_list = []
        progress(0, f"Preparing {total_samples} samples for export...")

        for i, source_sample in enumerate(ds_source):
            if i >= total_samples: break # Limit to known total_samples

            absolute_idx = i # Assuming source_sample is ordered and corresponds to index i

            audio_entry = source_sample.get("audio")
            sentence_val = source_sample.get("sentence", "") # Default original sentence

            # Determine final audio and sentence based on annotations
            audio_dict_to_export = None # Default to no audio if deleted or issue

            # Resolve the source audio into something the Audio feature accepts:
            # a {array, sampling_rate} dict, or a URL string loaded lazily later.
            raw_audio_data = None
            audio_path_or_data = get_audio_path(audio_entry)
            if isinstance(audio_path_or_data, tuple): # Raw audio from get_audio_path
                raw_audio_data = {"array": audio_path_or_data[1], "sampling_rate": audio_path_or_data[0]}
            elif isinstance(audio_path_or_data, str) and (os.path.exists(audio_path_or_data) or audio_path_or_data.startswith("http")):
                # Local paths are loaded into arrays here (may be slow); URLs are
                # passed through for the datasets library to handle.
                if os.path.exists(audio_path_or_data):
                    try:
                        arr, sr = sf.read(audio_path_or_data)
                        raw_audio_data = {"array": arr, "sampling_rate": sr}
                    except Exception as e_load:
                        print(f"Warning: Could not load audio file {audio_path_or_data} for export: {e_load}")
                        # raw_audio_data remains None
                else: # URL
                    raw_audio_data = audio_path_or_data # Pass URL directly, Audio feature will handle

            audio_dict_to_export = raw_audio_data

            # Check annotations for this sample
            annotation_data = next((s for s in dataset_model_annotations.samples or [] if s.id == absolute_idx), None)

            if annotation_data:
                if annotation_data.ignore_it:
                    sentence_val = "AUDIO DELETED (This audio has been removed.)"
                    audio_dict_to_export = None # No audio
                else:
                    # Determine the "best" annotation to use.
                    # Priority: 1. approved in 2nd phase, 2. accepted in 1st phase
                    # by a reviewer, 3. latest annotation by update_at.
                    best_ann = None
                    if annotation_data.annotations:
                        # Find the annotation that WAS approved in the 2nd phase
                        # (the original annotator's submission), not a new one.
                        if annotation_data.is_approved_in_second_phase:
                            for ann in annotation_data.annotations:
                                if ann.second_phase_review_status == "approved":
                                    best_ann = ann
                                    break

                        if not best_ann: # Check for 1st phase accepted
                            for ann in annotation_data.annotations:
                                if ann.is_first_phase_accepted:
                                    best_ann = ann
                                    break

                        if not best_ann: # Fallback: latest annotation by update time
                            if annotation_data.annotations:
                                best_ann = sorted(annotation_data.annotations, key=lambda x: x.update_at, reverse=True)[0] # latest

                    if best_ann:
                        sentence_val = best_ann.annotated_subtitle or sentence_val # Use annotated if available
                        # Handle trimmed audio if specified in best_ann
                        if best_ann.audio_trims and audio_dict_to_export: # Only if audio exists
                            # trim_audio_action saves clips as
                            # trimmed_audio/trimmed_{abs_idx}_{voice_name}; reconstruct
                            # that path here to pick up the trimmed version.
                            original_voice_name = sanitize_string(os.path.basename(str(get_audio_path(audio_entry) or f"sample_{absolute_idx}")))
                            trimmed_path_potential = os.path.join("trimmed_audio", f"trimmed_{absolute_idx}_{original_voice_name}")
                            # Ensure extension consistency for look up
                            if not os.path.splitext(trimmed_path_potential)[1]: trimmed_path_potential += ".wav" # common default

                            if os.path.exists(trimmed_path_potential):
                                try:
                                    arr, sr = sf.read(trimmed_path_potential)
                                    audio_dict_to_export = {"array": arr, "sampling_rate": sr}
                                except Exception as e_trim_load:
                                    print(f"Warning: Could not load trimmed audio {trimmed_path_potential}: {e_trim_load}")
                                    # audio_dict_to_export remains as original loaded audio

            exported_data_list.append({
                "audio": audio_dict_to_export, # This will be None if deleted or failed to load
                "sentence": sanitize_sentence(sentence_val)
            })

            if (i + 1) % 100 == 0: # Progress update
                progress((i + 1) / total_samples, f"Processed {i+1}/{total_samples} samples")
                gc.collect()

        if not exported_data_list:
            return "No data to export after processing."

        # The Audio feature expects a path, a {array, sampling_rate} dict, or
        # bytes; None entries (deleted audio) are replaced with an empty-array
        # placeholder so cast_column does not fail.
        for item in exported_data_list:
            if item["audio"] is None: # If audio was marked for deletion / ignore_it
                item["audio"] = {"array": np.array([], dtype=np.float32), "sampling_rate": 16000} # Example placeholder
            elif isinstance(item["audio"], str): # If it's a URL or path string
                # The Audio feature will handle loading this.
                pass
            elif not (isinstance(item["audio"], dict) and "array" in item["audio"] and "sampling_rate" in item["audio"]):
                print(f"Warning: Invalid audio format for export for a sample, replacing with silent audio: {item['audio']}")
                item["audio"] = {"array": np.array([], dtype=np.float32), "sampling_rate": 16000}

        final_dataset = Dataset.from_list(exported_data_list)
        final_dataset = final_dataset.cast_column("audio", Audio()) # Cast to Audio feature type

        dataset_dict_export = DatasetDict({"train": final_dataset})

        progress(0.95, "Uploading to Hugging Face...")
        push_to_hub_with_retry(
            dataset_dict=dataset_dict_export,
            repo_id=repo_name_str,
            private=True, # Assuming private, can be a parameter
            token_val=hf_token_for_export
        )
        print(f"Upload done, total time: {time.time() - start_time:.2f}s")
        progress(1.0, "Upload complete!")
        return f"Exported to huggingface.co/datasets/{repo_name_str}"

    except Exception as e:
        error_msg = f"Export failed: {str(e)}"
        import traceback
        print(f"{error_msg}\n{traceback.format_exc()}")
        return error_msg
1142
-
1143
-
1144
- # Login function
1145
def hf_login(hf_token_val):
    """Authenticate a Hub token, authorize the user, and bootstrap the annotation UI.

    On success: stores the token and username globally, (re)computes dataset
    size and annotator ranges, loads page 0, and returns a long positional
    tuple matching login_button.click outputs — container visibility, initial
    reviewer value, token state, message, per-phase component updates, then the
    initial sample data. On failure: returns the 5-element login-error tuple.
    """
    global CURRENT_USERNAME, token, current_page_data, total_samples, annotator_ranges

    if not hf_token_val: # If user clears the box and clicks login
        return gr.update(visible=True), gr.update(visible=False), "", "", "Login failed: Token cannot be empty."

    try:
        user_info = whoami(token=hf_token_val)
        username = user_info['name']

        if username in ALLOWED_USERS:
            CURRENT_USERNAME = username
            token = hf_token_val # Store the validated token globally for other HF ops

            # Re-initialize dataset info and ranges for the logged-in user; this
            # retries fetching total_samples if it was not available earlier.
            ds_info = get_dataset_info() # Sets global total_samples
            if total_samples > 0:
                annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
                if SECOND_PHASE:
                    initialize_second_phase_assignments() # Depends on ANNOTATORS and their ranges
            else:
                # total_samples is critical for range computation; abort login.
                return gr.update(visible=True), gr.update(visible=False), "", hf_token_val, "Login successful, but failed to get dataset size. Cannot proceed."

            # Load initial page data for this user
            current_page_data = load_page_data(0) # page 0 for the current user

            # Determine initial UI state based on SECOND_PHASE
            is_second_phase_active = SECOND_PHASE

            # Component visibility/interactivity flags per phase
            updates = {
                # Phase 1 components
                "save_next_button_vis": not is_second_phase_active,
                "transcript_interactive": not is_second_phase_active,
                "trim_button_vis": not is_second_phase_active,
                "undo_trim_button_vis": not is_second_phase_active,
                "delete_button_vis": not is_second_phase_active,
                "first_phase_accept_cb_vis": (not is_second_phase_active and get_user_role(CURRENT_USERNAME) == "reviewer"),
                # Phase 2 components
                "approve_button_vis": is_second_phase_active,
                "reject_button_vis": is_second_phase_active,
            }

            initial_load = load_interface_data(0, 0) # Load data for the first sample (page 0, index 0 on page)

            # Positional return must match login_button.click outputs exactly:
            # login_container, main_container, reviewer textbox, token state,
            # message, 8 phase-dependent updates, then initial sample data.
            return (
                gr.update(visible=False), # login_container
                gr.update(visible=True), # main_container
                initial_load[4], # reviewer_textbox gr.update object
                hf_token_val, # hf_token_state
                f"Login successful! Welcome {CURRENT_USERNAME}. Phase: {'Review' if SECOND_PHASE else 'Annotation'}.", # login_message

                # UI component updates based on phase
                gr.update(visible=updates["save_next_button_vis"]),
                gr.update(interactive=updates["transcript_interactive"]), # transcript Textarea
                gr.update(visible=updates["trim_button_vis"]),
                gr.update(visible=updates["undo_trim_button_vis"]),
                gr.update(visible=updates["delete_button_vis"]),
                gr.update(visible=updates["first_phase_accept_cb_vis"]),
                gr.update(visible=updates["approve_button_vis"]),
                gr.update(visible=updates["reject_button_vis"]),

                # Initial data for the interface elements from load_interface_data
                initial_load[0], # page_idx_state
                initial_load[1], # idx_on_page_state
                initial_load[2], # audio_player
                initial_load[3], # transcript (already includes interactivity)
                # initial_load[4] is reviewer, already used above for initial value
                initial_load[5], # status_md
                initial_load[6], # original_transcript_state
            )

        else:
            CURRENT_USERNAME = None
            return gr.update(visible=True), gr.update(visible=False), "", hf_token_val, "User not authorized!"
    except Exception as e:
        CURRENT_USERNAME = None
        import traceback
        print(f"Login failed: {str(e)}\n{traceback.format_exc()}")
        return gr.update(visible=True), gr.update(visible=False), "", hf_token_val, f"Login failed: {str(e)}"
1231
-
1232
-
1233
# Set initial values for UI elements before login (mostly empty or default).
# These seed the Gradio components until hf_login replaces them.
init_page_idx = 0
init_idx_on_page = 0
init_audio_val = None  # no audio until a user logs in and a page is loaded
init_transcript_val = gr.update(value="", interactive=False) # Non-interactive before login
init_reviewer_val = gr.update(value="N/A", interactive=False)
init_status_val = "Please log in."
init_original_text_val = ""
1241
-
1242
  # Gradio Interface
1243
  css = """
1244
  .white { background-color: white; color: black; }
@@ -1316,13 +75,10 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1316
  save_next_button, transcript_tb, trim_button, undo_trim_button, delete_button,
1317
  first_phase_accept_cb, approve_button, reject_button,
1318
  # Initial data load updates
1319
- current_page_idx_state, current_idx_on_page_state, audio_player, transcript_tb, # transcript_tb updated twice, once for interactivity, once for value
1320
  status_md, original_transcript_state
1321
  ]
1322
- # Need to ensure transcript_tb gets value update from initial_load too.
1323
- # hf_login returns initial_load[3] which is gr.update(value=text, interactive=editable) for transcript.
1324
- # So, one update to transcript_tb should be sufficient if it carries both value and interactivity.
1325
-
1326
  login_button.click(
1327
  fn=hf_login,
1328
  inputs=[hf_token_input],
@@ -1341,17 +97,15 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1341
  inputs=[current_page_idx_state, current_idx_on_page_state, transcript_tb, first_phase_accept_cb],
1342
  outputs=navigation_outputs
1343
  )
1344
- # 'Next (no save)' button (only for Phase 1)
1345
  next_button.click(
1346
- fn=go_next_sample_wrapper, # This simple nav doesn't save unsaved changes. User should be aware.
1347
  inputs=[current_page_idx_state, current_idx_on_page_state],
1348
  outputs=navigation_outputs
1349
- ).then( # Add a small JS to clear unsaved changes marker if any (conceptual)
1350
- None, None, None, _js="() => { /* Clear unsaved visual cues if any */ }"
1351
- )
1352
 
1353
  prev_button.click(
1354
- fn=go_prev_sample_wrapper, # Similarly, does not auto-save.
1355
  inputs=[current_page_idx_state, current_idx_on_page_state],
1356
  outputs=navigation_outputs
1357
  )
@@ -1359,12 +113,12 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1359
  # Phase 2 actions
1360
  approve_button.click(
1361
  fn=review_and_next_sample_second_phase,
1362
- inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("approved")], # Pass action string
1363
  outputs=navigation_outputs
1364
  )
1365
  reject_button.click(
1366
  fn=review_and_next_sample_second_phase,
1367
- inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("rejected")], # Pass action string
1368
  outputs=navigation_outputs
1369
  )
1370
 
@@ -1372,15 +126,15 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1372
  trim_button.click(
1373
  fn=trim_audio_action,
1374
  inputs=[current_page_idx_state, current_idx_on_page_state, trim_start_tb, trim_end_tb],
1375
- outputs=navigation_outputs # Outputs audio_player, status_md primarily
1376
  )
1377
  undo_trim_button.click(
1378
  fn=undo_trim_action,
1379
  inputs=[current_page_idx_state, current_idx_on_page_state],
1380
  outputs=navigation_outputs
1381
  )
1382
- delete_button.click( # This will be a confirmable action
1383
- fn=confirm_delete_audio_action, # Direct action for simplicity, could add confirmation dialog
1384
  inputs=[current_page_idx_state, current_idx_on_page_state],
1385
  outputs=navigation_outputs
1386
  )
@@ -1395,17 +149,14 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
1395
  fn=export_to_huggingface,
1396
  inputs=[hf_repo_name_tb, hf_token_state],
1397
  outputs=[hf_export_status_md],
1398
- queue=True # Export can be long
1399
  )
1400
 
1401
  # Launch the interface
1402
  if __name__ == "__main__":
1403
- # For testing, you might want to set SECOND_PHASE here or via environment variable
1404
- # Example: os.environ.get("APP_SECOND_PHASE", "False").lower() == "true"
1405
- # SECOND_PHASE = True # Force second phase for testing
1406
  if SECOND_PHASE:
1407
  print("==== APPLICATION RUNNING IN SECOND PHASE (REVIEW MODE) ====")
1408
  else:
1409
  print("==== APPLICATION RUNNING IN FIRST PHASE (ANNOTATION MODE) ====")
1410
 
1411
- demo.queue().launch(debug=True, share=False) # Share=True for ngrok link if needed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Gradio Interface
2
  css = """
3
  .white { background-color: white; color: black; }
 
75
  save_next_button, transcript_tb, trim_button, undo_trim_button, delete_button,
76
  first_phase_accept_cb, approve_button, reject_button,
77
  # Initial data load updates
78
+ current_page_idx_state, current_idx_on_page_state, audio_player, transcript_tb,
79
  status_md, original_transcript_state
80
  ]
81
+
 
 
 
82
  login_button.click(
83
  fn=hf_login,
84
  inputs=[hf_token_input],
 
97
  inputs=[current_page_idx_state, current_idx_on_page_state, transcript_tb, first_phase_accept_cb],
98
  outputs=navigation_outputs
99
  )
100
+
101
  next_button.click(
102
+ fn=go_next_sample_wrapper,
103
  inputs=[current_page_idx_state, current_idx_on_page_state],
104
  outputs=navigation_outputs
105
+ ) # REMOVED the problematic .then() call here
 
 
106
 
107
  prev_button.click(
108
+ fn=go_prev_sample_wrapper,
109
  inputs=[current_page_idx_state, current_idx_on_page_state],
110
  outputs=navigation_outputs
111
  )
 
113
  # Phase 2 actions
114
  approve_button.click(
115
  fn=review_and_next_sample_second_phase,
116
+ inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("approved")],
117
  outputs=navigation_outputs
118
  )
119
  reject_button.click(
120
  fn=review_and_next_sample_second_phase,
121
+ inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("rejected")],
122
  outputs=navigation_outputs
123
  )
124
 
 
126
  trim_button.click(
127
  fn=trim_audio_action,
128
  inputs=[current_page_idx_state, current_idx_on_page_state, trim_start_tb, trim_end_tb],
129
+ outputs=navigation_outputs
130
  )
131
  undo_trim_button.click(
132
  fn=undo_trim_action,
133
  inputs=[current_page_idx_state, current_idx_on_page_state],
134
  outputs=navigation_outputs
135
  )
136
+ delete_button.click(
137
+ fn=confirm_delete_audio_action,
138
  inputs=[current_page_idx_state, current_idx_on_page_state],
139
  outputs=navigation_outputs
140
  )
 
149
  fn=export_to_huggingface,
150
  inputs=[hf_repo_name_tb, hf_token_state],
151
  outputs=[hf_export_status_md],
152
+ queue=True
153
  )
154
 
155
  # Launch the interface
156
  if __name__ == "__main__":
 
 
 
157
  if SECOND_PHASE:
158
  print("==== APPLICATION RUNNING IN SECOND PHASE (REVIEW MODE) ====")
159
  else:
160
  print("==== APPLICATION RUNNING IN FIRST PHASE (ANNOTATION MODE) ====")
161
 
162
+ demo.queue().launch(debug=True, share=False)