Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,1244 +1,3 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import os
|
3 |
-
import json
|
4 |
-
import pandas as pd
|
5 |
-
from datasets import load_dataset, DatasetDict, Dataset, Audio
|
6 |
-
from huggingface_hub import HfApi, whoami, login, hf_hub_download
|
7 |
-
import tempfile
|
8 |
-
import shutil
|
9 |
-
import gc
|
10 |
-
import time
|
11 |
-
import psutil
|
12 |
-
from pydub import AudioSegment
|
13 |
-
import soundfile as sf
|
14 |
-
from tenacity import retry, stop_after_attempt, wait_exponential
|
15 |
-
import re
|
16 |
-
import numpy as np
|
17 |
-
from pydantic import BaseModel
|
18 |
-
from typing import Optional, List, Tuple
|
19 |
-
from datetime import datetime
|
20 |
-
|
21 |
-
# Authenticate against the Hugging Face Hub when a token is configured.
token = os.getenv("hf_token")
if not token:
    print("Warning: hf_token environment variable not set. Hugging Face Hub operations might fail.")
else:
    login(token)

# --- Configuration ---
HF_DATASET_NAME = "navidved/channelb-raw-data"
AUDIO_DIR = "audio"
SAVE_PATH = "annotations.json"
ALLOWED_USERS = ["vargha", "navidved", "userC"]  # userC was added to test the 2nd phase with >1 annotator
REVIEWERS = ["vargha"]  # First phase reviewers
# First phase annotators: every allowed user who is not a reviewer.
ANNOTATORS = [user for user in ALLOWED_USERS if user not in REVIEWERS]
CURRENT_USERNAME = None
PAGE_SIZE = 100  # Kept for pagination logic, though review might be sample by sample
SAVE_INTERVAL = 10

# --- Second phase configuration ---
SECOND_PHASE = False  # Set to True to activate second phase review
# Populated if SECOND_PHASE is True. Maps: reviewer_username -> original_annotator_username
# Example: {"navidved": "userC"} means navidved reviews userC's work
SECOND_PHASE_REVIEW_MAPPING = {}

# --- Global state ---
current_page = 0
ds_iter = None
current_page_data = None
audio_backup = {}
annotation_count = 0
unsaved_changes = {}  # Primarily for first phase
total_samples = 0
annotator_ranges = {}  # {annotator_username: (start_idx, end_idx)} for first phase
|
53 |
-
|
54 |
-
# Pydantic data models
|
55 |
-
class AudioTrim(BaseModel):
    """Start/end bounds of one audio trim attached to an annotation."""
    # NOTE(review): units are not stated anywhere in this file -- presumably seconds; confirm.
    start: float
    end: float
|
58 |
-
|
59 |
-
class Annotation(BaseModel):
    """One user's annotation of a sample, plus its first- and second-phase review state."""
    annotator: str  # Original annotator (first phase)
    annotated_subtitle: Optional[str] = None
    audio_trims: Optional[List[AudioTrim]] = None

    # First phase review fields
    is_first_phase_accepted: bool = False
    first_phase_reviewer_username: Optional[str] = None

    # Second phase review fields
    second_phase_reviewed_by: Optional[str] = None
    second_phase_review_status: Optional[str] = None  # "approved" or "rejected"
    second_phase_review_timestamp: Optional[datetime] = None

    # Creation / last-modification timestamps (required, no default).
    create_at: datetime
    update_at: datetime
|
75 |
-
|
76 |
-
class Sample(BaseModel):
    """A dataset sample together with every annotation recorded for it."""
    id: int  # Elsewhere in this file the absolute dataset index is stored here.
    voice_name: str
    original_subtitle: str
    ignore_it: bool = False  # When True the UI presents the audio as deleted.
    description: Optional[str] = None
    annotations: Optional[List[Annotation]] = None
    is_approved_in_second_phase: bool = False  # True if the primary annotation is approved in 2nd phase
|
84 |
-
|
85 |
-
class DatasetModel(BaseModel):  # Renamed to avoid conflict with datasets.Dataset
    """Root object of the annotations JSON file: the list of annotated samples."""
    samples: Optional[List[Sample]] = None
|
87 |
-
|
88 |
-
# Utility functions
|
89 |
-
def load_saved_annotations():
    """Load the annotation store, trying progressively weaker sources.

    Order of preference:
      1. the local JSON file at SAVE_PATH;
      2. the same file downloaded from the HF dataset repo (only when a
         token is available), which is then cached locally;
      3. a brand-new empty DatasetModel.

    Returns:
        DatasetModel: never None.
    """
    loaded = None

    # 1) Local cache.
    if os.path.exists(SAVE_PATH):
        try:
            with open(SAVE_PATH, "r", encoding="utf-8") as local_file:
                loaded = DatasetModel(**json.load(local_file))
            print("Loaded annotations from local JSON file")
        except Exception as e:
            # The file is deliberately left on disk; only the in-memory result is discarded.
            print(f"Error loading local JSON file: {str(e)}. Removing invalid file.")
            loaded = None

    # 2) Remote copy in the dataset repository.
    if loaded is None and token:
        try:
            remote_copy = hf_hub_download(
                repo_id=HF_DATASET_NAME,
                filename=SAVE_PATH,
                repo_type="dataset",
                token=token,
            )
            with open(remote_copy, "r", encoding="utf-8") as remote_file:
                loaded = DatasetModel(**json.load(remote_file))
            # Cache it locally
            with open(SAVE_PATH, "w", encoding="utf-8") as cache_file:
                cache_file.write(loaded.model_dump_json(exclude_none=True, indent=4))
            print("Loaded annotations from HF dataset repository and cached locally")
        except Exception as e:
            print(f"Error loading JSON file from HF repo: {str(e)}")
            loaded = None

    if loaded is not None:
        return loaded

    # 3) Nothing usable anywhere: start from scratch.
    fresh_model = DatasetModel(samples=[])
    print("Created new empty DatasetModel for annotations")
    return fresh_model
|
127 |
-
|
128 |
-
def save_annotations(dataset_model: DatasetModel):
    """Persist the annotation store to SAVE_PATH and periodically push it to the Hub.

    The JSON is serialized *before* any file is opened and is written to a
    temporary sibling file that is then moved over SAVE_PATH with
    os.replace. Opening SAVE_PATH directly in "w" mode would truncate the
    existing file first, so a serialization or write failure would wipe the
    previously saved annotations.

    Every SAVE_INTERVAL-th successful save triggers push_json_to_hf() when a
    token is available. Errors are reported via print and never raised.

    Args:
        dataset_model: the full annotation store to write.
    """
    global annotation_count
    tmp_path = f"{SAVE_PATH}.tmp"
    try:
        payload = dataset_model.model_dump_json(exclude_none=True, indent=4)
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(payload)
        # Swap the complete file into place; the old content survives any earlier failure.
        os.replace(tmp_path, SAVE_PATH)
        print(f"Saved annotations to {SAVE_PATH}")
        annotation_count += 1
        if annotation_count % SAVE_INTERVAL == 0 and token:
            push_json_to_hf()
    except Exception as e:
        print(f"Error saving annotations: {str(e)}")
        # Best-effort cleanup of a partially written temp file.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
|
139 |
-
|
140 |
-
def push_json_to_hf():
    """Upload the local annotations file (SAVE_PATH) to the HF dataset repo.

    Does nothing but print a notice when no token is available. Upload
    failures are printed, not raised.
    """
    if token:
        try:
            HfApi().upload_file(
                path_or_fileobj=SAVE_PATH,
                path_in_repo=SAVE_PATH,
                repo_type="dataset",
                repo_id=HF_DATASET_NAME,
                token=token,
            )
            print("Uploaded annotations.json to Hugging Face repository")
        except Exception as e:
            print(f"Error uploading JSON file: {str(e)}")
    else:
        print("Cannot push to HF: token not available.")
|
156 |
-
|
157 |
-
def calculate_annotator_ranges(total_samples_val, annotators_list):
    """Split sample indices 0..total_samples_val-1 into contiguous per-annotator ranges.

    Each annotator gets total // n samples; the first total % n annotators
    get one extra. Annotators whose share would be empty are omitted.

    Args:
        total_samples_val: number of samples to distribute.
        annotators_list: annotator names, in assignment order.

    Returns:
        dict mapping annotator name -> (start_idx, end_idx), both inclusive.
        Empty when there are no annotators or no samples.
    """
    annotator_count = len(annotators_list)
    if annotator_count == 0 or total_samples_val <= 0:
        return {}

    base_share, remainder = divmod(total_samples_val, annotator_count)

    assigned = {}
    cursor = 0  # first index not yet handed out
    for position, name in enumerate(annotators_list):
        share = base_share + (1 if position < remainder else 0)
        # Never let a range run past the last valid index.
        last = min(cursor + share - 1, total_samples_val - 1)
        if last < cursor:
            continue  # nothing left for this annotator
        assigned[name] = (cursor, last)
        cursor = last + 1
    return assigned
|
177 |
-
|
178 |
-
def initialize_second_phase_assignments():
    """Populate SECOND_PHASE_REVIEW_MAPPING (reviewer -> original annotator).

    With a single annotator that annotator reviews their own work; with
    several, assignment is cyclic so that ANNOTATORS[i] reviews
    ANNOTATORS[i-1]. annotator_ranges is computed first if it is still empty
    and the sample count is known. A warning is printed for any reviewed
    annotator that has no range.
    """
    global SECOND_PHASE_REVIEW_MAPPING, annotator_ranges

    # Requires at least 1 annotator (self-review), or 2 for cross-review.
    if not ANNOTATORS or len(ANNOTATORS) < 1:
        print("Not enough annotators for second phase review.")
        SECOND_PHASE_REVIEW_MAPPING = {}
        return

    # Ensure annotator_ranges is populated.
    if not annotator_ranges and total_samples > 0:
        print("Populating annotator_ranges for second phase initialization.")
        annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)

    if len(ANNOTATORS) == 1:
        # A lone annotator reviews their own work.
        sole_annotator = ANNOTATORS[0]
        SECOND_PHASE_REVIEW_MAPPING[sole_annotator] = sole_annotator
        print(f"Second phase: {sole_annotator} will review their own work.")
    else:
        # Rotate the list by one so each reviewer is paired with the
        # previous annotator (the first reviewer wraps to the last).
        reviewed_in_order = ANNOTATORS[-1:] + ANNOTATORS[:-1]
        for reviewer_user, original_annotator_user in zip(ANNOTATORS, reviewed_in_order):
            SECOND_PHASE_REVIEW_MAPPING[reviewer_user] = original_annotator_user
            print(f"Second phase: {reviewer_user} will review {original_annotator_user}'s work.")

    # Verify that original annotators have ranges. A miss can happen if
    # total_samples was 0 or annotator_ranges was not calculated.
    for original_annotator in SECOND_PHASE_REVIEW_MAPPING.values():
        if original_annotator not in annotator_ranges:
            print(f"Warning: Original annotator {original_annotator} has no range defined in annotator_ranges.")
|
210 |
-
|
211 |
-
def get_user_allowed_range(username):
    """Return the inclusive (start_idx, end_idx) range `username` may access, or None.

    First phase: reviewers get the whole dataset (None if the sample count is
    unknown); annotators get their entry in annotator_ranges.
    Second phase: a user gets the range of the annotator they are assigned to
    review; users without an assignment get None.
    """
    global annotator_ranges, total_samples

    if not SECOND_PHASE:
        # First phase logic.
        if get_user_role(username) == "reviewer":
            return (0, total_samples - 1) if total_samples > 0 else None
        return annotator_ranges.get(username)

    # Second phase logic.
    if not SECOND_PHASE_REVIEW_MAPPING:  # Ensure it's initialized
        initialize_second_phase_assignments()

    reviewed_annotator = SECOND_PHASE_REVIEW_MAPPING.get(username)
    if not reviewed_annotator:
        # User is not a designated reviewer in the second phase mapping.
        return None

    # The reviewer works on the original range of the annotator under review.
    if not annotator_ranges and total_samples > 0:  # Lazy init for ranges if needed
        annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
    return annotator_ranges.get(reviewed_annotator)
|
234 |
-
|
235 |
-
def is_within_range(absolute_idx, allowed_range):
    """Return True when absolute_idx lies inside the inclusive allowed_range.

    A missing range (None) never contains anything.
    """
    return allowed_range is not None and allowed_range[0] <= absolute_idx <= allowed_range[1]
|
239 |
-
|
240 |
-
def get_user_role(username):  # This defines first-phase roles
    """Return "reviewer" for names listed in REVIEWERS, otherwise "annotator"."""
    if username in REVIEWERS:
        return "reviewer"
    return "annotator"
|
242 |
-
|
243 |
-
def init_dataset_iterator():
    """Compatibility stub: report that dataset iteration is ready.

    No global iterator is created here; load_page_data() opens a fresh
    streaming dataset each time it runs. The former try/except wrapped only
    `return True`, so its error branch could never execute and has been
    dropped together with the unused `global ds_iter` declaration.

    Returns:
        bool: always True.
    """
    return True
|
254 |
-
|
255 |
-
def load_page_data(page_num=0):
    """Fetch the samples of page `page_num` that the current user may see.

    The dataset is re-opened in streaming mode on every call and read from
    the beginning until the end of the wanted window, so cost grows with the
    page number. The window is the intersection of the page's absolute index
    span with the range returned by get_user_allowed_range(CURRENT_USERNAME).

    Each returned row carries two extra columns: 'absolute_idx' (index in the
    full dataset) and 'id_within_page' (0-based position in this page).

    Side effects: updates the globals current_page_data and (once the stream
    has been read) current_page.

    Returns:
        pandas.DataFrame: the page rows; empty when loading fails, the user
        has no range, or the page lies outside the user's range.
    """
    global current_page_data, current_page, total_samples

    base_columns = ["audio", "sentence", "id_within_page"]

    # For streaming, we re-fetch and skip.
    try:
        stream = iter(load_dataset(HF_DATASET_NAME, split="train", streaming=True))
    except Exception as e:
        print(f"Error loading dataset for page data: {e}")
        current_page_data = pd.DataFrame(columns=base_columns)
        return current_page_data

    # The visible window is based on full-dataset indices, not just page logic.
    allowed_range = get_user_allowed_range(CURRENT_USERNAME)
    if not allowed_range:
        print(f"User {CURRENT_USERNAME} has no allowed range.")
        current_page_data = pd.DataFrame(columns=base_columns)
        return current_page_data

    # Absolute span of the requested page, clipped to the user's range.
    page_first = page_num * PAGE_SIZE
    page_last = page_first + PAGE_SIZE - 1
    effective_start_idx = max(page_first, allowed_range[0])
    effective_end_idx = min(page_last, allowed_range[1])

    collected = []
    for stream_position, record in enumerate(stream):
        if stream_position > effective_end_idx:
            break  # past everything needed for this page and user range
        if stream_position < effective_start_idx:
            continue  # not yet inside the window
        record['absolute_idx'] = stream_position
        record['id_within_page'] = len(collected)  # relative index on current page view
        collected.append(record)
        if len(collected) >= PAGE_SIZE:  # filled the page
            break

    current_page = page_num
    if collected:
        current_page_data = pd.DataFrame(collected)
    else:
        # No samples found (e.g., page is outside the effective range).
        current_page_data = pd.DataFrame(columns=base_columns + ["absolute_idx"])
        print(f"No samples found for user {CURRENT_USERNAME} on page {page_num} within effective range {effective_start_idx}-{effective_end_idx}")

    gc.collect()
    return current_page_data
|
316 |
-
|
317 |
-
|
318 |
-
def get_dataset_info():
    """Return {'num_samples': N} for the train split, caching N in total_samples.

    When total_samples is already positive it is returned directly.
    Otherwise the split is loaded non-streaming and its num_rows is used.
    The previous implementation first opened the dataset in streaming mode
    and fetched its info, but never used the result; that extra round-trip
    has been removed.

    Returns:
        dict: {'num_samples': count}, or {'num_samples': -1} when the count
        cannot be determined (callers' range logic will not work then).
    """
    global total_samples
    if total_samples > 0:  # already fetched
        return {'num_samples': total_samples}

    try:
        # Non-streaming load: streaming split metadata is not relied on for the count.
        row_count = load_dataset(HF_DATASET_NAME, split="train").num_rows
    except Exception as e:
        print(f"Error getting dataset info: {e}")
        return {'num_samples': -1}

    if row_count and row_count > 0:
        total_samples = row_count
        return {'num_samples': total_samples}

    print("Warning: Could not reliably determine total_samples from dataset info.")
    return {'num_samples': -1}
|
353 |
-
|
354 |
-
|
355 |
-
# Initial data load (placed after the functions it calls are defined).
# No dataset iterator is kept globally; pages re-open the stream on demand.
dataset_info = get_dataset_info()  # This sets global total_samples
if total_samples <= 0:
    print("Warning: total_samples is not positive. Annotation ranges and second phase assignments may be incorrect.")
    annotator_ranges = {}
else:
    annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
    if SECOND_PHASE:
        initialize_second_phase_assignments()  # Initialize after annotator_ranges might be populated

# The first page is loaded after login, once CURRENT_USERNAME is set
# (hf_login success path), not here.
|
369 |
-
|
370 |
-
|
371 |
-
def get_audio_path(audio_entry):
    """Turn a dataset audio entry into something an audio player can consume.

    Args:
        audio_entry: a dict (decoded audio or path record) or a string
            (URL or filesystem path).

    Returns:
        - (sampling_rate, array) for a dict holding both keys;
        - the dict's "path" value (or None) for any other dict;
        - the string itself for URLs and existing paths;
        - AUDIO_DIR/<string> when that joined path exists;
        - the string unchanged as a last resort;
        - None for any other input type.
    """
    if isinstance(audio_entry, dict):
        has_decoded_audio = "array" in audio_entry and "sampling_rate" in audio_entry
        if not has_decoded_audio:
            return audio_entry.get("path", None)
        # Tuple form for direct use by the player.
        return (audio_entry["sampling_rate"], audio_entry["array"])

    if not isinstance(audio_entry, str):
        return None  # unknown type

    if audio_entry.startswith(("http://", "https://")):
        return audio_entry  # URL
    if os.path.exists(audio_entry):
        return audio_entry  # path that resolves as given
    if AUDIO_DIR:
        # Relative path: try it under the configured audio directory.
        candidate = os.path.join(AUDIO_DIR, audio_entry)
        if os.path.exists(candidate):
            return candidate
    # Return as is; might be a relative path resolvable by datasets.
    return audio_entry
|
388 |
-
|
389 |
-
# Core functions
|
390 |
-
def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_action, accepted_flag=False):
    """Create or update the acting user's first-phase annotation for one sample.

    Args:
        page_idx: current page index (not used by the save itself).
        idx_on_page: 0-based row of the sample in current_page_data.
        transcript: text to store; None is treated as an empty string.
        current_user_performing_action: username of the annotator or reviewer.
        accepted_flag: only honoured when the user is a first-phase reviewer;
            marks the annotation as accepted by that reviewer.

    Returns:
        str: a status message (success or the reason nothing was saved).

    Changes from the previous version:
      * a negative idx_on_page is rejected (it used to select a row counted
        from the end of the page via iloc), matching get_sample();
      * a None transcript no longer raises on .strip();
      * the sample id is converted to a plain int before it reaches the
        model, so it does not depend on numpy scalar handling.
    """
    global current_page_data, unsaved_changes

    if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
        return "Invalid index or data not loaded for current page."

    actual_sample_info = current_page_data.iloc[idx_on_page]
    absolute_idx = int(actual_sample_info['absolute_idx'])

    # First phase saving logic. In the 2nd phase the range check is
    # implicitly handled by page loading.
    allowed_range = get_user_allowed_range(current_user_performing_action)
    if not SECOND_PHASE and not is_within_range(absolute_idx, allowed_range):
        return "You are not allowed to annotate this sample (out of range)."

    audio_entry_original = actual_sample_info["audio"]  # This might be path or dict
    voice_name = os.path.basename(str(get_audio_path(audio_entry_original) or f"sample_{absolute_idx}"))

    dataset_model = load_saved_annotations()
    sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
    if not sample:
        sample = Sample(
            id=absolute_idx,
            voice_name=voice_name,
            original_subtitle=actual_sample_info["sentence"],
            annotations=[],
        )
        dataset_model.samples = dataset_model.samples or []
        dataset_model.samples.append(sample)

    now = datetime.now()
    cleaned_transcript = (transcript or "").strip()
    is_reviewer = get_user_role(current_user_performing_action) == "reviewer"

    # The annotation is keyed by the acting user, whether annotator or reviewer.
    annotation = next(
        (a for a in sample.annotations or [] if a.annotator == current_user_performing_action),
        None,
    )

    if annotation:
        annotation.annotated_subtitle = cleaned_transcript
        annotation.update_at = now
        if is_reviewer:
            annotation.is_first_phase_accepted = accepted_flag
            annotation.first_phase_reviewer_username = current_user_performing_action if accepted_flag else None
    else:
        new_annotation_data = {
            "annotator": current_user_performing_action,
            "annotated_subtitle": cleaned_transcript,
            "create_at": now,
            "update_at": now,
            "is_first_phase_accepted": False,  # Default
        }
        if is_reviewer:
            new_annotation_data["is_first_phase_accepted"] = accepted_flag
            if accepted_flag:
                new_annotation_data["first_phase_reviewer_username"] = current_user_performing_action

        annotation = Annotation(**new_annotation_data)
        sample.annotations = sample.annotations or []
        sample.annotations.append(annotation)

    # The pending edit (if any) is now persisted.
    unsaved_changes.pop(absolute_idx, None)

    save_annotations(dataset_model)
    return f"✓ Saved annotation for sample {absolute_idx}"
|
453 |
-
|
454 |
-
def handle_second_phase_action(page_idx, idx_on_page, action: str):  # action is "approved" or "rejected"
    """Record the current user's second-phase verdict on one sample.

    The verdict is stored on the annotation made by the annotator that
    CURRENT_USERNAME is assigned to review. If that annotator never submitted
    an annotation for the sample, a placeholder built from the original
    subtitle is created and reviewed instead.

    Args:
        page_idx: current page index (not used by the action itself).
        idx_on_page: 0-based row of the sample in current_page_data.
        action: "approved" or "rejected".

    Returns:
        str: a status message (success or the reason nothing was saved).

    Changes from the previous version:
      * Sample.is_approved_in_second_phase now follows the latest verdict; it
        previously stayed True when an approved sample was later rejected;
      * a negative idx_on_page is rejected instead of selecting a row from
        the end of the page;
      * one timestamp is used for all fields written by a single review.
    """
    global current_page_data, CURRENT_USERNAME

    if not SECOND_PHASE:
        return "Not in second phase."
    if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
        return "Invalid index or data not loaded for current page (second phase)."

    actual_sample_info = current_page_data.iloc[idx_on_page]
    absolute_idx = int(actual_sample_info['absolute_idx'])

    original_annotator_to_review = SECOND_PHASE_REVIEW_MAPPING.get(CURRENT_USERNAME)
    if not original_annotator_to_review:
        return "You are not assigned to review any user's work."

    dataset_model = load_saved_annotations()
    sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
    if not sample:
        # The sample exists in the source dataset but not in annotations.json;
        # a second-phase reviewer can only review an existing record.
        return f"Error: Sample {absolute_idx} not found in annotations.json for review."

    now = datetime.now()

    # Find the annotation made by the annotator under review.
    annotation_to_review = next(
        (ann for ann in sample.annotations or [] if ann.annotator == original_annotator_to_review),
        None,
    )
    if not annotation_to_review:
        # The original annotator skipped this item: review the dataset's
        # original subtitle via a placeholder annotation.
        print(f"Warning: No prior annotation by {original_annotator_to_review} for sample {absolute_idx}. Reviewing original subtitle implicitly.")
        annotation_to_review = Annotation(
            annotator=original_annotator_to_review,
            annotated_subtitle=sample.original_subtitle,
            # Approximate the original creation time from the first annotation, if any.
            create_at=sample.annotations[0].create_at if sample.annotations else now,
            update_at=now,
        )
        sample.annotations = sample.annotations or []
        sample.annotations.append(annotation_to_review)

    annotation_to_review.second_phase_reviewed_by = CURRENT_USERNAME
    annotation_to_review.second_phase_review_status = action
    annotation_to_review.second_phase_review_timestamp = now
    annotation_to_review.update_at = now

    # Keep the sample-level flag in step with the latest verdict.
    sample.is_approved_in_second_phase = (action == "approved")

    save_annotations(dataset_model)
    return f"✓ Review ({action}) saved for sample {absolute_idx} (Original annotator: {original_annotator_to_review})"
|
509 |
-
|
510 |
-
|
511 |
-
def get_sample(page_idx, idx_on_page, current_user_displaying):  # current_user_displaying is CURRENT_USERNAME
    """Build everything the UI needs to display one sample.

    Args:
        page_idx: 0-based page index (used only for the status text).
        idx_on_page: 0-based row of the sample in current_page_data.
        current_user_displaying: username the view is rendered for.

    Returns:
        An 8-tuple:
        (audio_value, transcript_to_display, status_message, reviewer_field,
         color, transcript_editable, accepted_flag, original_dataset_transcript)

    Fix: when current_page_data is None the old guard still evaluated
    len(current_page_data) while formatting its message and raised
    TypeError. That case now returns a "No data loaded." status.
    """
    global current_page_data, unsaved_changes, total_samples

    if current_page_data is None:
        return None, "", "No data loaded.", "unreviewed", "white", True, False, ""
    if idx_on_page < 0 or idx_on_page >= len(current_page_data):
        return None, "", f"Invalid index. Range is 0-{len(current_page_data)-1}", "unreviewed", "white", True, False, ""

    actual_sample_info = current_page_data.iloc[idx_on_page]
    absolute_idx = actual_sample_info['absolute_idx']

    audio_entry_original = actual_sample_info["audio"]
    audio_val = get_audio_path(audio_entry_original)

    default_transcript = actual_sample_info["sentence"]
    transcript_to_display = default_transcript

    # UI states (defaults describe a fresh, never-annotated sample)
    ui_reviewer_field = "unreviewed"  # Textbox showing who annotated/reviewed
    ui_color = "white"
    ui_editable = True  # Transcript text area
    ui_is_accepted_flag = False  # First phase checkmark logic, or second phase display
    ui_status_message = f"Sample {absolute_idx+1}"
    if total_samples > 0:
        ui_status_message += f" of {total_samples}"

    dataset_model = load_saved_annotations()
    sample_from_json = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)

    # When annotations.json has no record for this index, all defaults above stand.
    if sample_from_json:
        if sample_from_json.ignore_it:
            audio_val = None
            transcript_to_display = "AUDIO DELETED (This audio has been removed.)"
            ui_reviewer_field = "deleted"
            ui_color = "red"
            ui_editable = False

        elif SECOND_PHASE:
            ui_editable = False  # Transcript not editable in 2nd phase
            original_annotator_being_reviewed = SECOND_PHASE_REVIEW_MAPPING.get(current_user_displaying)

            if not original_annotator_being_reviewed:  # Should not happen if UI is controlled properly
                transcript_to_display = "Error: User not in review mapping."
                ui_color = "red"
            else:
                ui_reviewer_field = f"Reviewing: {original_annotator_being_reviewed}"
                annotation_under_review = next(
                    (ann for ann in sample_from_json.annotations or []
                     if ann.annotator == original_annotator_being_reviewed),
                    None,
                )

                if annotation_under_review:
                    transcript_to_display = annotation_under_review.annotated_subtitle or default_transcript
                    ui_is_accepted_flag = (annotation_under_review.second_phase_review_status == "approved")

                    if annotation_under_review.second_phase_reviewed_by:
                        if annotation_under_review.second_phase_reviewed_by == current_user_displaying:
                            # green = approved by current user, orange = rejected by current user
                            ui_color = "green" if annotation_under_review.second_phase_review_status == "approved" else "orange"
                        else:  # Reviewed by someone else
                            ui_color = "gray"
                            # NOTE(review): indentation was lost in the source this was
                            # recovered from; the suffix is assumed to apply only when
                            # another user did the review -- confirm against the original.
                            ui_reviewer_field += f" (Reviewed by {annotation_under_review.second_phase_reviewed_by})"
                    else:  # Pending review by current_user_displaying
                        ui_color = "yellow"
                else:  # No annotation from original annotator for this sample
                    transcript_to_display = default_transcript  # Show original dataset subtitle
                    ui_reviewer_field += " (Original annotator made no submission)"
                    ui_color = "lightgray"  # Needs review, but based on original

        else:  # First Phase Logic
            # An annotation accepted by a first-phase reviewer takes precedence.
            accepted_first_phase_annotation = next(
                (a for a in sample_from_json.annotations or []
                 if a.is_first_phase_accepted and a.first_phase_reviewer_username),
                None,
            )

            if accepted_first_phase_annotation:
                transcript_to_display = accepted_first_phase_annotation.annotated_subtitle or default_transcript
                ui_reviewer_field = accepted_first_phase_annotation.first_phase_reviewer_username
                ui_color = "green"
                ui_is_accepted_flag = True
                # Only a first-phase reviewer can edit accepted work.
                ui_editable = (get_user_role(current_user_displaying) == "reviewer")
            else:
                # Next: the current user's own annotation (annotator or reviewer).
                user_specific_annotation = next(
                    (a for a in sample_from_json.annotations or [] if a.annotator == current_user_displaying),
                    None,
                )
                if user_specific_annotation:
                    transcript_to_display = user_specific_annotation.annotated_subtitle or default_transcript
                    ui_reviewer_field = user_specific_annotation.annotator
                    ui_color = "yellow" if absolute_idx not in unsaved_changes else "pink"
                    ui_editable = True
                else:
                    # Then: not-yet-accepted annotations made by other users.
                    other_annotations = [
                        a for a in sample_from_json.annotations or []
                        if a.annotator != current_user_displaying and not a.is_first_phase_accepted
                    ]
                    if other_annotations:
                        if get_user_role(current_user_displaying) == "reviewer":
                            # A reviewer sees (and may edit) the first such annotation.
                            other_ann_to_show = other_annotations[0]
                            transcript_to_display = other_ann_to_show.annotated_subtitle or default_transcript
                            ui_reviewer_field = other_ann_to_show.annotator
                            ui_color = "blue"
                            ui_editable = True
                        else:
                            # An annotator sees the original text and cannot edit
                            # another annotator's unreviewed work.
                            transcript_to_display = default_transcript
                            ui_reviewer_field = "labeled by another annotator"
                            ui_color = "lightblue"
                            ui_editable = False
                    elif absolute_idx in unsaved_changes:
                        # No annotations at all, but the user has a pending edit.
                        transcript_to_display = unsaved_changes[absolute_idx]
                        ui_reviewer_field = current_user_displaying
                        ui_color = "pink"
                        ui_editable = True
                    # else: defaults (original text, unreviewed, white, editable)

    # Status message update (page_idx is 0-indexed)
    current_page_for_status = page_idx + 1
    ui_status_message = f"{ui_status_message} - Page {current_page_for_status}"
    if SECOND_PHASE:
        ui_status_message += " (Review Phase)"
    else:
        ui_status_message += " (Annotation Phase)"

    return audio_val, transcript_to_display, ui_status_message, ui_reviewer_field, ui_color, ui_editable, ui_is_accepted_flag, default_transcript
|
636 |
-
|
637 |
-
|
638 |
-
def load_interface_data(page_idx, idx_on_page):
    """Build the UI output tuple for the sample at (page_idx, idx_on_page).

    Fetches the sample for CURRENT_USERNAME through get_sample() and repackages
    the result in the order the navigation handlers return it:

        (page index, index on page, audio value, transcript update,
         reviewer-textbox update, status text, original dataset text)

    The "accepted" flag that get_sample() also returns is not forwarded.
    """
    sample_view = get_sample(page_idx, idx_on_page, CURRENT_USERNAME)
    (audio_value, display_text, status_text, reviewer_label,
     css_class, can_edit, _accepted, dataset_text) = sample_view

    # The transcript box is only editable when get_sample() says so.
    transcript_update = gr.update(value=display_text, interactive=can_edit)
    # The colour name doubles as the CSS class of the reviewer textbox.
    reviewer_update = gr.update(value=reviewer_label, elem_classes=[css_class])

    return (
        page_idx,
        idx_on_page,
        audio_value,
        transcript_update,
        reviewer_update,
        status_text,
        dataset_text,
    )
|
658 |
-
|
659 |
-
# Navigation functions
|
660 |
-
def navigate_sample(page_idx, idx_on_page, direction: int):
    """Move one sample forward or backward, crossing page boundaries if needed.

    Args:
        page_idx: Index of the page currently shown.
        idx_on_page: Position of the current sample inside that page.
        direction: 1 for the next sample, -1 for the previous one.

    Returns:
        The 7-tuple produced by load_interface_data() for the new sample, or,
        when no move is possible, the unchanged indices, no-op ``gr.update()``
        values and an explanatory status string.
    """
    global current_page_data

    def stay(message):
        # Keep every component untouched and only report why we did not move.
        return page_idx, idx_on_page, gr.update(), gr.update(), gr.update(), message, gr.update()

    if current_page_data is None or len(current_page_data) == 0:
        return stay("No data loaded.")

    # load_page_data() replaces the global page; keep a handle on the page that
    # is on screen so a failed load cannot leave the UI indices pointing at an
    # empty/foreign page.
    # NOTE(review): assumes load_page_data() rebinds current_page_data rather
    # than mutating the DataFrame in place - confirm.
    page_before_move = current_page_data
    target_idx_on_page = idx_on_page + direction

    if target_idx_on_page < 0:  # Need to go to previous page
        if page_idx <= 0:
            return stay("At the beginning of your assigned samples.")
        previous_page = load_page_data(page_idx - 1)
        if previous_page is None or previous_page.empty:
            current_page_data = page_before_move
            return stay("No more samples in this direction (prev page).")
        # Land on the last item of the previous page.
        return load_interface_data(page_idx - 1, len(previous_page) - 1)

    if target_idx_on_page >= len(page_before_move):  # Need to go to next page
        next_page = load_page_data(page_idx + 1)
        if next_page is not None and not next_page.empty:
            return load_interface_data(page_idx + 1, 0)

        current_page_data = page_before_move
        # Distinguish "end of this user's range" from "page simply missing".
        allowed_range = get_user_allowed_range(CURRENT_USERNAME)
        current_abs_idx = page_before_move.iloc[idx_on_page]['absolute_idx']
        if allowed_range and current_abs_idx >= allowed_range[1]:
            return stay("At the end of your assigned samples.")
        return stay("No more samples in this direction (next page).")

    # Target is on the page that is already loaded.
    return load_interface_data(page_idx, target_idx_on_page)
|
700 |
-
|
701 |
-
|
702 |
-
def go_next_sample_wrapper(page_idx, idx_on_page):
    """Thin UI-facing wrapper: step to the following sample."""
    step_forward = 1
    return navigate_sample(page_idx, idx_on_page, step_forward)
|
704 |
-
|
705 |
-
def go_prev_sample_wrapper(page_idx, idx_on_page):
    """Thin UI-facing wrapper: step back to the preceding sample."""
    step_backward = -1
    return navigate_sample(page_idx, idx_on_page, step_backward)
|
707 |
-
|
708 |
-
|
709 |
-
def save_and_next_sample_first_phase(page_idx, idx_on_page, current_text, is_accepted_by_reviewer_flag):
    """Persist the current transcript for CURRENT_USERNAME, then go to the next sample.

    The action is always attributed to CURRENT_USERNAME. The "accepted"
    checkbox value is honoured only when that user has the "reviewer" role;
    for every other role the sample is saved with accepted_flag=False.

    Returns:
        Whatever navigate_sample() returns for a forward step.
    """
    if get_user_role(CURRENT_USERNAME) == "reviewer":
        accepted = is_accepted_by_reviewer_flag
    else:
        accepted = False

    outcome = save_sample_data(
        page_idx,
        idx_on_page,
        current_text,
        CURRENT_USERNAME,
        accepted_flag=accepted,
    )
    print(outcome)  # Log save message

    return navigate_sample(page_idx, idx_on_page, 1)
|
722 |
-
|
723 |
-
|
724 |
-
def review_and_next_sample_second_phase(page_idx, idx_on_page, review_action: str):
    """Record a second-phase review action, log its feedback, then advance.

    Args:
        page_idx: Index of the page currently shown.
        idx_on_page: Position of the reviewed sample inside that page.
        review_action: Action identifier forwarded to handle_second_phase_action().

    Returns:
        Whatever navigate_sample() returns for a forward step.
    """
    print(handle_second_phase_action(page_idx, idx_on_page, review_action))  # Log feedback message
    return navigate_sample(page_idx, idx_on_page, 1)
|
729 |
-
|
730 |
-
|
731 |
-
def _stay_after_failed_jump(page_idx, idx_on_page, status_msg):
    """Rebuild the UI outputs for the sample already on screen, using status_msg as status."""
    audio, text, _, rev, color, edit, _, orig_txt = get_sample(page_idx, idx_on_page, CURRENT_USERNAME)
    return (
        page_idx,
        idx_on_page,
        audio,
        gr.update(value=text, interactive=edit),
        gr.update(value=rev, elem_classes=[color]),
        status_msg,
        orig_txt,
    )


def jump_to_absolute_idx(target_abs_idx_str, current_page_idx, current_idx_on_page):
    """Jump to the sample with the given absolute dataset index.

    Args:
        target_abs_idx_str: Target index as typed by the user; parsed with int().
            Negative values are clamped to 0.
        current_page_idx: Page currently shown (used when the jump fails).
        current_idx_on_page: Sample currently shown (used when the jump fails).

    Returns:
        The 7-tuple of UI outputs (same layout as load_interface_data()). On any
        failure the current sample is re-rendered with an error status instead.
    """
    global current_page_data
    # load_page_data() replaces the global page; remember the one on screen so
    # every failure path can re-render the current sample from the right data.
    # NOTE(review): assumes load_page_data() rebinds current_page_data rather
    # than mutating it in place - confirm.
    page_before_jump = current_page_data

    def fail(status_msg):
        global current_page_data
        current_page_data = page_before_jump
        return _stay_after_failed_jump(current_page_idx, current_idx_on_page, status_msg)

    try:
        target_abs_idx = max(int(target_abs_idx_str), 0)

        allowed_range = get_user_allowed_range(CURRENT_USERNAME)
        if not is_within_range(target_abs_idx, allowed_range):
            return fail(f"Target index {target_abs_idx} is outside your assigned range {allowed_range}.")

        new_page_idx = target_abs_idx // PAGE_SIZE
        temp_page_data = load_page_data(new_page_idx)  # This updates global current_page_data

        if temp_page_data is None or temp_page_data.empty:
            return fail(f"No data found for page {new_page_idx} containing index {target_abs_idx}.")

        # The loaded page may not start exactly at new_page_idx * PAGE_SIZE, so
        # look the target up by its 'absolute_idx' column. Use the *positional*
        # offset (what .iloc expects downstream), not the DataFrame index label.
        hits = np.flatnonzero((current_page_data['absolute_idx'] == target_abs_idx).to_numpy())
        if len(hits) > 0:
            return load_interface_data(new_page_idx, int(hits[0]))

        status_msg = f"Index {target_abs_idx} is in range, but not found on page {new_page_idx}. Displaying start of page."
        print(status_msg)  # Log this
        if current_page_data.empty:  # Page is actually empty: revert to old view
            return fail(status_msg)

        # Show the first item of the loaded page, but keep the explanation
        # visible in the status slot so the user knows why they landed there.
        view = list(load_interface_data(new_page_idx, 0))
        view[5] = status_msg
        return tuple(view)

    except ValueError:
        return fail("Invalid index format for jump.")
    except Exception as e:
        status_msg = f"Error jumping to index: {e}"
        print(status_msg)
        return fail(status_msg)
|
785 |
-
|
786 |
-
|
787 |
-
# Audio editing functions (simplifying, assuming these are for phase 1 only)
|
788 |
-
def trim_audio_action(page_idx, idx_on_page, trim_start_str, trim_end_str):
    """Cut the current sample's audio to [start, end] seconds and record the trim.

    The trimmed file is written to ``trimmed_audio/trimmed_<abs_idx>_<name>`` and
    the trim bounds are stored on CURRENT_USERNAME's annotation for the sample
    (the annotation is created if the user has none yet). Disabled while
    SECOND_PHASE is active.

    Args:
        page_idx: Index of the page currently shown.
        idx_on_page: Position of the sample inside that page.
        trim_start_str: Trim start in seconds (anything float() accepts).
        trim_end_str: Trim end in seconds (anything float() accepts).

    Returns:
        A 7-tuple of UI outputs: page index, index on page, audio value,
        transcript, reviewer text, status message, transcript.
    """
    if SECOND_PHASE: return page_idx, idx_on_page, gr.Audio(), gr.Textbox(), gr.Textbox(), "Trimming disabled in Review Phase.", gr.Textbox()

    audio_val, transcript, base_status, saved_reviewer, _, _, _, _ = get_sample(page_idx, idx_on_page, CURRENT_USERNAME)

    def respond(audio, status):
        return page_idx, idx_on_page, audio, transcript, saved_reviewer, status, transcript

    if not isinstance(audio_val, str) or not os.path.exists(audio_val):
        # get_sample() did not hand back a local file; try the source entry.
        if current_page_data is None or idx_on_page >= len(current_page_data):
            return respond(audio_val, "Audio data not available for trimming.")
        resolved_path = get_audio_path(current_page_data.iloc[idx_on_page]["audio"])
        if not (isinstance(resolved_path, str) and os.path.exists(resolved_path)):
            # Raw audio data (tuple), URL, or non-existent path.
            return respond(audio_val, "Trimming not supported for this audio format or it's not a local file.")
        audio_val = resolved_path

    row = current_page_data.iloc[idx_on_page]
    absolute_idx = row['absolute_idx']
    voice_name_original = os.path.basename(str(row["audio"]))

    try:
        # Parse once and reject empty/inverted ranges before touching the disk;
        # otherwise an empty clip would be exported and a bogus trim recorded.
        start_s = float(trim_start_str)
        end_s = float(trim_end_str)
        if start_s < 0 or end_s <= start_s:
            return respond(audio_val, f"Error trimming audio: invalid range {start_s}-{end_s} (need 0 <= start < end).")

        # Check the annotation record first so a missing sample does not leave
        # an orphaned trimmed file behind.
        dataset_model = load_saved_annotations()
        sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
        if not sample:  # Should exist if we are editing it
            return respond(audio_val, "Error: Sample not found in annotations for trimming.")

        audio_seg = AudioSegment.from_file(audio_val)
        trimmed_seg = audio_seg[int(start_s * 1000):int(end_s * 1000)]  # pydub slices in ms

        os.makedirs("trimmed_audio", exist_ok=True)
        trimmed_filename = f"trimmed_{absolute_idx}_{voice_name_original}"
        # Fall back to wav when the source name has no recognised audio extension.
        if not trimmed_filename.lower().endswith(('.wav', '.mp3', '.flac')):
            trimmed_filename += ".wav"
        trimmed_path = os.path.join("trimmed_audio", trimmed_filename)

        # The extension is guaranteed by the check above; normalise its case so
        # e.g. "X.WAV" is exported with format "wav".
        export_format = os.path.splitext(trimmed_path)[1][1:].lower()
        trimmed_seg.export(trimmed_path, format=export_format)

        now = datetime.now()
        trims = [AudioTrim(start=start_s, end=end_s)]
        # Associate trim with current user's annotation for this sample
        annotation = next((a for a in sample.annotations or [] if a.annotator == CURRENT_USERNAME), None)
        if not annotation:  # Create if doesn't exist
            annotation = Annotation(
                annotator=CURRENT_USERNAME,
                annotated_subtitle=transcript,  # Current transcript
                audio_trims=trims,
                create_at=now,
                update_at=now
            )
            sample.annotations = sample.annotations or []
            sample.annotations.append(annotation)
        else:
            annotation.audio_trims = trims
            annotation.update_at = now

        save_annotations(dataset_model)
        return respond(trimmed_path, f"{base_status} [Trimmed]")
    except Exception as e:
        return respond(audio_val, f"Error trimming audio: {str(e)}")
|
860 |
-
|
861 |
-
|
862 |
-
def undo_trim_action(page_idx, idx_on_page):
    """Remove CURRENT_USERNAME's recorded trim for the sample and restore its audio.

    Clears ``audio_trims`` on the user's annotation (if one with a trim exists),
    saves the annotations, and returns the audio resolved from the source
    dataset entry. Disabled while SECOND_PHASE is active.

    Returns:
        A 7-tuple of UI outputs: page index, index on page, audio value,
        transcript, reviewer text, status message, transcript.
    """
    if SECOND_PHASE: return page_idx, idx_on_page, gr.Audio(), gr.Textbox(), gr.Textbox(), "Undo Trim disabled in Review Phase.", gr.Textbox()

    audio_val, transcript, base_status, saved_reviewer, _, _, _, _ = get_sample(page_idx, idx_on_page, CURRENT_USERNAME)

    # Same guard as trim_audio_action: without a loaded row there is nothing to restore.
    if current_page_data is None or idx_on_page >= len(current_page_data):
        return page_idx, idx_on_page, audio_val, transcript, saved_reviewer, "Audio data not available for undoing trim.", transcript

    row = current_page_data.iloc[idx_on_page]
    absolute_idx = row['absolute_idx']

    dataset_model = load_saved_annotations()
    sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
    annotation = None
    if sample:
        # Trim is user-specific
        annotation = next((a for a in sample.annotations or [] if a.annotator == CURRENT_USERNAME), None)

    trim_removed = bool(annotation and annotation.audio_trims)
    if trim_removed:
        annotation.audio_trims = None
        annotation.update_at = datetime.now()
        save_annotations(dataset_model)

    # Re-resolve the audio from the source entry of the loaded page.
    restored_audio_val = get_audio_path(row["audio"])

    # Only claim an undo when a trim was actually cleared.
    suffix = "[Trim undone]" if trim_removed else "[No trim to undo]"
    new_status = f"{base_status} {suffix}"
    return page_idx, idx_on_page, restored_audio_val, transcript, saved_reviewer, new_status, transcript
|
889 |
-
|
890 |
-
|
891 |
-
def confirm_delete_audio_action(page_idx, idx_on_page):
    """Flag the current sample as deleted and record that under CURRENT_USERNAME.

    Sets ``ignore_it`` on the sample's saved record (creating the record when
    it does not exist yet), overwrites or creates the user's annotation with a
    fixed "AUDIO DELETED" marker text, clears any trims on an existing
    annotation, and saves. Disabled while SECOND_PHASE is active.

    Returns:
        A 7-tuple of UI outputs: page index, index on page, ``None`` for the
        audio, the marker text, the literal "deleted", a status message, and
        the marker text again.
    """
    if SECOND_PHASE: return page_idx, idx_on_page, gr.Audio(), gr.Textbox(), gr.Textbox(), "Delete disabled in Review Phase.", gr.Textbox()

    marker = "AUDIO DELETED (This audio has been removed.)"

    absolute_idx = current_page_data.iloc[idx_on_page]['absolute_idx']
    voice_name_original = os.path.basename(str(current_page_data.iloc[idx_on_page]["audio"]))

    dataset_model = load_saved_annotations()
    sample = next((entry for entry in dataset_model.samples or [] if entry.id == absolute_idx), None)
    if not sample:
        # First time this sample is touched: register it in the annotation store.
        sample = Sample(
            id=absolute_idx,
            voice_name=voice_name_original,
            original_subtitle=current_page_data.iloc[idx_on_page]["sentence"],
            annotations=[]
        )
        dataset_model.samples = dataset_model.samples or []
        dataset_model.samples.append(sample)

    sample.ignore_it = True
    timestamp = datetime.now()

    # Create/update an annotation by CURRENT_USERNAME to mark this action
    own_annotation = next((ann for ann in sample.annotations or [] if ann.annotator == CURRENT_USERNAME), None)
    if not own_annotation:
        own_annotation = Annotation(
            annotator=CURRENT_USERNAME,
            annotated_subtitle=marker,
            create_at=timestamp,
            update_at=timestamp
        )
        sample.annotations = sample.annotations or []
        sample.annotations.append(own_annotation)
    else:
        own_annotation.annotated_subtitle = marker
        own_annotation.audio_trims = None  # Clear trims
        own_annotation.update_at = timestamp

    save_annotations(dataset_model)

    status_parts = [f"Sample {absolute_idx+1} [Audio deleted]"]
    if total_samples > 0:
        status_parts.append(f" of {total_samples}")
    new_status = "".join(status_parts)

    # Return values to update UI correctly after deletion
    return page_idx, idx_on_page, None, marker, "deleted", new_status, marker
|
936 |
-
|
937 |
-
|
938 |
-
# Export functions (largely unchanged, ensure CURRENT_USERNAME context if it matters for export)
|
939 |
-
def sanitize_string(s):
    """Return *s* with every character outside ``[\\w./-]`` replaced by ``_``.

    Non-string input is converted with str() first. Word characters, dots,
    slashes and hyphens are kept.
    """
    if not isinstance(s, str):
        s = str(s)
    # The hyphen must sit at the end of the class: written as ``\w-.`` it is
    # parsed as a range starting at ``\w`` and re raises "bad character range".
    return re.sub(r'[^\w./-]', '_', s)
|
942 |
-
|
943 |
-
def sanitize_sentence(s):
    """Return *s* as text that round-trips through UTF-8.

    Non-string input is converted with str(); characters that cannot be
    encoded as UTF-8 are dropped.
    """
    text = s if isinstance(s, str) else str(s)
    encoded = text.encode('utf-8', errors='ignore')
    return encoded.decode('utf-8')
|
946 |
-
|
947 |
-
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def push_to_hub_with_retry(dataset_dict, repo_id, private=True, token_val=None):
    """Push *dataset_dict* to the Hub repo *repo_id*; the decorator retries on failure.

    Without a token nothing is pushed: a message is printed and the function
    returns normally.
    """
    if token_val:
        print(f"Pushing dataset to {repo_id}")
        dataset_dict.push_to_hub(repo_id, private=private, token=token_val)
        return
    print("Cannot push to hub: No token provided for push_to_hub_with_retry.")
|
954 |
-
|
955 |
-
_SILENT_AUDIO_SAMPLING_RATE = 16000


def _silent_audio_placeholder():
    """Return an empty audio dict used where a sample has no exportable audio."""
    return {"array": np.array([], dtype=np.float32), "sampling_rate": _SILENT_AUDIO_SAMPLING_RATE}


def _pick_export_annotation(sample_record):
    """Choose the annotation to export: 2nd-phase approved, else 1st-phase accepted, else latest."""
    annotations = sample_record.annotations
    if not annotations:
        return None
    if sample_record.is_approved_in_second_phase:
        for ann in annotations:
            if ann.second_phase_review_status == "approved":
                return ann
    for ann in annotations:
        if ann.is_first_phase_accepted:
            return ann
    # Nothing formally accepted/approved: fall back to the most recently updated one.
    return max(annotations, key=lambda ann: ann.update_at)


def _load_export_audio(audio_path_or_data):
    """Convert a get_audio_path() result into an array dict, a URL string, or None."""
    if isinstance(audio_path_or_data, tuple):  # Raw audio: (sampling_rate, array)
        return {"array": audio_path_or_data[1], "sampling_rate": audio_path_or_data[0]}
    if not isinstance(audio_path_or_data, str):
        return None
    if os.path.exists(audio_path_or_data):
        try:
            arr, sr = sf.read(audio_path_or_data)
        except Exception as e_load:
            print(f"Warning: Could not load audio file {audio_path_or_data} for export: {e_load}")
            return None
        return {"array": arr, "sampling_rate": sr}
    if audio_path_or_data.startswith("http"):
        return audio_path_or_data  # Pass URL directly, Audio feature will handle
    return None


def _load_trimmed_export_audio(absolute_idx, audio_path_or_data):
    """Load the trimmed file saved for a sample, or return None when it is missing/unreadable."""
    original_voice_name = sanitize_string(os.path.basename(str(audio_path_or_data or f"sample_{absolute_idx}")))
    trimmed_path_potential = os.path.join("trimmed_audio", f"trimmed_{absolute_idx}_{original_voice_name}")
    if not os.path.splitext(trimmed_path_potential)[1]:
        trimmed_path_potential += ".wav"  # common default
    if not os.path.exists(trimmed_path_potential):
        return None
    try:
        arr, sr = sf.read(trimmed_path_potential)
    except Exception as e_trim_load:
        print(f"Warning: Could not load trimmed audio {trimmed_path_potential}: {e_trim_load}")
        return None
    return {"array": arr, "sampling_rate": sr}


def _normalize_export_audio(audio_value):
    """Make sure an exported audio value is a URL/path string or an array dict."""
    if audio_value is None:  # Deleted / ignored / failed to load
        return _silent_audio_placeholder()
    if isinstance(audio_value, str):  # The Audio feature will handle loading this.
        return audio_value
    if isinstance(audio_value, dict) and "array" in audio_value and "sampling_rate" in audio_value:
        return audio_value
    print(f"Warning: Invalid audio format for export for a sample, replacing with silent audio: {audio_value}")
    return _silent_audio_placeholder()


def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progress()):
    """Rebuild the dataset with saved annotations applied and push it to the Hub.

    Iterates the first ``total_samples`` rows of HF_DATASET_NAME ("train" split),
    replaces each sentence by the text of the preferred annotation (see
    _pick_export_annotation), swaps in trimmed audio when the chosen annotation
    has trims and the trimmed file exists, blanks samples marked ``ignore_it``,
    and uploads the result as a private dataset.

    Args:
        repo_name_str: Target repo id; passed through sanitize_string() first.
        hf_token_for_export: Hugging Face token used for the upload.
        progress: Gradio progress tracker.

    Returns:
        A human-readable success or failure message; exceptions are caught and
        reported in that message.
    """
    if not hf_token_for_export:
        return "Export failed: Hugging Face token is missing."
    try:
        start_time = time.time()
        repo_name_str = sanitize_string(repo_name_str)
        print(f"Export started at {time.strftime('%Y-%m-%d %H:%M:%S')}")

        dataset_model_annotations = load_saved_annotations()  # Load all annotations

        if total_samples <= 0:
            return "Export failed: Total number of samples is unknown or invalid."

        # Index saved records once instead of scanning the list for every row.
        # setdefault keeps the first record per id, matching a first-match scan.
        annotations_by_id = {}
        for saved_record in dataset_model_annotations.samples or []:
            annotations_by_id.setdefault(saved_record.id, saved_record)

        # Non-streaming load; everything is collected in memory before upload.
        ds_source = load_dataset(HF_DATASET_NAME, split="train", streaming=False)

        exported_data_list = []
        progress(0, f"Preparing {total_samples} samples for export...")

        for i, source_sample in enumerate(ds_source):
            if i >= total_samples:
                break  # Limit to known total_samples

            absolute_idx = i  # Assuming source rows are ordered and correspond to index i
            sentence_val = source_sample.get("sentence", "")  # Default original sentence

            # Resolve the source audio once; both helpers below reuse the result.
            audio_path_or_data = get_audio_path(source_sample.get("audio"))
            audio_to_export = _load_export_audio(audio_path_or_data)

            annotation_data = annotations_by_id.get(absolute_idx)
            if annotation_data:
                if annotation_data.ignore_it:
                    sentence_val = "AUDIO DELETED (This audio has been removed.)"
                    audio_to_export = None  # No audio
                else:
                    best_ann = _pick_export_annotation(annotation_data)
                    if best_ann:
                        sentence_val = best_ann.annotated_subtitle or sentence_val
                        if best_ann.audio_trims and audio_to_export:  # Only if audio exists
                            trimmed_audio = _load_trimmed_export_audio(absolute_idx, audio_path_or_data)
                            if trimmed_audio is not None:
                                audio_to_export = trimmed_audio
                            # else: keep the untrimmed source audio

            exported_data_list.append({
                "audio": _normalize_export_audio(audio_to_export),
                "sentence": sanitize_sentence(sentence_val)
            })

            if (i + 1) % 100 == 0:  # Progress update
                progress((i + 1) / total_samples, f"Processed {i+1}/{total_samples} samples")
                gc.collect()

        if not exported_data_list:
            return "No data to export after processing."

        final_dataset = Dataset.from_list(exported_data_list)
        final_dataset = final_dataset.cast_column("audio", Audio())  # Cast to Audio feature type

        dataset_dict_export = DatasetDict({"train": final_dataset})

        progress(0.95, "Uploading to Hugging Face...")
        push_to_hub_with_retry(
            dataset_dict=dataset_dict_export,
            repo_id=repo_name_str,
            private=True,  # Assuming private, can be a parameter
            token_val=hf_token_for_export
        )
        print(f"Upload done, total time: {time.time() - start_time:.2f}s")
        progress(1.0, "Upload complete!")
        return f"Exported to huggingface.co/datasets/{repo_name_str}"

    except Exception as e:
        error_msg = f"Export failed: {str(e)}"
        import traceback
        print(f"{error_msg}\n{traceback.format_exc()}")
        return error_msg
|
1142 |
-
|
1143 |
-
|
1144 |
-
# Login function
|
1145 |
-
# Number of outputs wired to login_button.click() beyond the first five
# (login_container, main_container, reviewer_textbox, hf_token_state,
# login_message): 8 phase-dependent visibility/interactivity updates plus
# 6 initial-data values. This mirrors the tuple returned on a successful login.
_LOGIN_TRAILING_OUTPUTS = 14


def _login_failure_outputs(token_value, message):
    """Build the full-length ``hf_login`` return tuple for a failed login.

    Args:
        token_value: Value to store in the token state output.
        message: Text to show in the login message output.

    Returns:
        A tuple with the same number of items as the successful-login return:
        the login container stays visible, the main container stays hidden,
        and every trailing output receives a no-op ``gr.update()``.
    """
    return (
        gr.update(visible=True),   # login_container
        gr.update(visible=False),  # main_container
        "",                        # reviewer_textbox
        token_value,               # hf_token_state
        message,                   # login_message
    ) + tuple(gr.update() for _ in range(_LOGIN_TRAILING_OUTPUTS))


# Login function
def hf_login(hf_token_val):
    """Validate a Hugging Face token and, on success, reveal the main UI.

    The token is checked with ``whoami``; the resolved username must be listed
    in ``ALLOWED_USERS``. On success the globals ``CURRENT_USERNAME``,
    ``token``, ``annotator_ranges`` and ``current_page_data`` are updated and
    the first sample is loaded.

    Args:
        hf_token_val: Token string typed into the login box (may be empty).

    Returns:
        A 19-item tuple for the outputs of ``login_button.click()``:
        login_container, main_container, reviewer_textbox, hf_token_state,
        login_message, eight phase-dependent visibility/interactivity updates,
        then page index, index on page, audio, transcript, status and original
        transcript. Failure paths return a tuple of the same length so the
        error message reaches the UI instead of tripping an output-count
        mismatch in the event handler.
    """
    global CURRENT_USERNAME, token, current_page_data, total_samples, annotator_ranges

    if not hf_token_val:  # If user clears the box and clicks login
        return _login_failure_outputs("", "Login failed: Token cannot be empty.")

    try:
        user_info = whoami(token=hf_token_val)
        username = user_info['name']

        if username not in ALLOWED_USERS:
            CURRENT_USERNAME = None
            return _login_failure_outputs(hf_token_val, "User not authorized!")

        CURRENT_USERNAME = username
        token = hf_token_val  # Store the validated token globally for other HF ops

        # Initialize/re-initialize dataset info and ranges for the logged-in
        # user, so a previously failed size lookup is attempted again.
        get_dataset_info()  # per the original note, this sets the global total_samples
        if total_samples > 0:
            annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
            if SECOND_PHASE:
                initialize_second_phase_assignments()  # Depends on ANNOTATORS and their ranges
        else:
            # Without a dataset size the annotator ranges cannot be computed.
            return _login_failure_outputs(
                hf_token_val,
                "Login successful, but failed to get dataset size. Cannot proceed.",
            )

        # Load initial page data for this user (page 0)
        current_page_data = load_page_data(0)

        # Phase 1 controls are shown only while the second phase is inactive;
        # phase 2 controls only while it is active.
        is_second_phase_active = SECOND_PHASE
        updates = {
            # Phase 1 components
            "save_next_button_vis": not is_second_phase_active,
            "transcript_interactive": not is_second_phase_active,
            "trim_button_vis": not is_second_phase_active,
            "undo_trim_button_vis": not is_second_phase_active,
            "delete_button_vis": not is_second_phase_active,
            "first_phase_accept_cb_vis": (not is_second_phase_active and get_user_role(CURRENT_USERNAME) == "reviewer"),
            # Phase 2 components
            "approve_button_vis": is_second_phase_active,
            "reject_button_vis": is_second_phase_active,
        }

        initial_load = load_interface_data(0, 0)  # first sample (page 0, index 0 on page)

        return (
            gr.update(visible=False),  # login_container
            gr.update(visible=True),   # main_container
            initial_load[4],           # reviewer_textbox update
            hf_token_val,              # hf_token_state
            f"Login successful! Welcome {CURRENT_USERNAME}. Phase: {'Review' if SECOND_PHASE else 'Annotation'}.",  # login_message

            # UI component updates based on phase
            gr.update(visible=updates["save_next_button_vis"]),
            gr.update(interactive=updates["transcript_interactive"]),  # transcript text area
            gr.update(visible=updates["trim_button_vis"]),
            gr.update(visible=updates["undo_trim_button_vis"]),
            gr.update(visible=updates["delete_button_vis"]),
            gr.update(visible=updates["first_phase_accept_cb_vis"]),
            gr.update(visible=updates["approve_button_vis"]),
            gr.update(visible=updates["reject_button_vis"]),

            # Initial data for the interface elements from load_interface_data
            initial_load[0],  # page_idx_state
            initial_load[1],  # idx_on_page_state
            initial_load[2],  # audio_player
            initial_load[3],  # transcript
            # initial_load[4] (reviewer) is already used above
            initial_load[5],  # status_md
            initial_load[6],  # original_transcript_state
        )
    except Exception as e:
        # Top-level UI boundary: log the traceback and report the failure
        # in the login message rather than letting the handler crash.
        CURRENT_USERNAME = None
        import traceback
        print(f"Login failed: {str(e)}\n{traceback.format_exc()}")
        return _login_failure_outputs(hf_token_val, f"Login failed: {str(e)}")
|
1231 |
-
|
1232 |
-
|
1233 |
-
# Set initial values for UI elements before login (mostly empty or default)
|
1234 |
-
init_page_idx = 0
|
1235 |
-
init_idx_on_page = 0
|
1236 |
-
init_audio_val = None
|
1237 |
-
init_transcript_val = gr.update(value="", interactive=False) # Non-interactive before login
|
1238 |
-
init_reviewer_val = gr.update(value="N/A", interactive=False)
|
1239 |
-
init_status_val = "Please log in."
|
1240 |
-
init_original_text_val = ""
|
1241 |
-
|
1242 |
# Gradio Interface
|
1243 |
css = """
|
1244 |
.white { background-color: white; color: black; }
|
@@ -1316,13 +75,10 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
|
|
1316 |
save_next_button, transcript_tb, trim_button, undo_trim_button, delete_button,
|
1317 |
first_phase_accept_cb, approve_button, reject_button,
|
1318 |
# Initial data load updates
|
1319 |
-
current_page_idx_state, current_idx_on_page_state, audio_player, transcript_tb,
|
1320 |
status_md, original_transcript_state
|
1321 |
]
|
1322 |
-
|
1323 |
-
# hf_login returns initial_load[3] which is gr.update(value=text, interactive=editable) for transcript.
|
1324 |
-
# So, one update to transcript_tb should be sufficient if it carries both value and interactivity.
|
1325 |
-
|
1326 |
login_button.click(
|
1327 |
fn=hf_login,
|
1328 |
inputs=[hf_token_input],
|
@@ -1341,17 +97,15 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
|
|
1341 |
inputs=[current_page_idx_state, current_idx_on_page_state, transcript_tb, first_phase_accept_cb],
|
1342 |
outputs=navigation_outputs
|
1343 |
)
|
1344 |
-
|
1345 |
next_button.click(
|
1346 |
-
fn=go_next_sample_wrapper,
|
1347 |
inputs=[current_page_idx_state, current_idx_on_page_state],
|
1348 |
outputs=navigation_outputs
|
1349 |
-
)
|
1350 |
-
None, None, None, _js="() => { /* Clear unsaved visual cues if any */ }"
|
1351 |
-
)
|
1352 |
|
1353 |
prev_button.click(
|
1354 |
-
fn=go_prev_sample_wrapper,
|
1355 |
inputs=[current_page_idx_state, current_idx_on_page_state],
|
1356 |
outputs=navigation_outputs
|
1357 |
)
|
@@ -1359,12 +113,12 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
|
|
1359 |
# Phase 2 actions
|
1360 |
approve_button.click(
|
1361 |
fn=review_and_next_sample_second_phase,
|
1362 |
-
inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("approved")],
|
1363 |
outputs=navigation_outputs
|
1364 |
)
|
1365 |
reject_button.click(
|
1366 |
fn=review_and_next_sample_second_phase,
|
1367 |
-
inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("rejected")],
|
1368 |
outputs=navigation_outputs
|
1369 |
)
|
1370 |
|
@@ -1372,15 +126,15 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
|
|
1372 |
trim_button.click(
|
1373 |
fn=trim_audio_action,
|
1374 |
inputs=[current_page_idx_state, current_idx_on_page_state, trim_start_tb, trim_end_tb],
|
1375 |
-
outputs=navigation_outputs
|
1376 |
)
|
1377 |
undo_trim_button.click(
|
1378 |
fn=undo_trim_action,
|
1379 |
inputs=[current_page_idx_state, current_idx_on_page_state],
|
1380 |
outputs=navigation_outputs
|
1381 |
)
|
1382 |
-
delete_button.click(
|
1383 |
-
fn=confirm_delete_audio_action,
|
1384 |
inputs=[current_page_idx_state, current_idx_on_page_state],
|
1385 |
outputs=navigation_outputs
|
1386 |
)
|
@@ -1395,17 +149,14 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
|
|
1395 |
fn=export_to_huggingface,
|
1396 |
inputs=[hf_repo_name_tb, hf_token_state],
|
1397 |
outputs=[hf_export_status_md],
|
1398 |
-
queue=True
|
1399 |
)
|
1400 |
|
1401 |
# Launch the interface
|
1402 |
if __name__ == "__main__":
    # SECOND_PHASE may be overridden before this point when testing the
    # review workflow; the banner below reports whichever phase is active.
    phase_banner = (
        "==== APPLICATION RUNNING IN SECOND PHASE (REVIEW MODE) ===="
        if SECOND_PHASE
        else "==== APPLICATION RUNNING IN FIRST PHASE (ANNOTATION MODE) ===="
    )
    print(phase_banner)

    # Turn on the request queue, then start the app with debug=True, share=False.
    queued_app = demo.queue()
    queued_app.launch(debug=True, share=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Gradio Interface
|
2 |
css = """
|
3 |
.white { background-color: white; color: black; }
|
|
|
75 |
save_next_button, transcript_tb, trim_button, undo_trim_button, delete_button,
|
76 |
first_phase_accept_cb, approve_button, reject_button,
|
77 |
# Initial data load updates
|
78 |
+
current_page_idx_state, current_idx_on_page_state, audio_player, transcript_tb,
|
79 |
status_md, original_transcript_state
|
80 |
]
|
81 |
+
|
|
|
|
|
|
|
82 |
login_button.click(
|
83 |
fn=hf_login,
|
84 |
inputs=[hf_token_input],
|
|
|
97 |
inputs=[current_page_idx_state, current_idx_on_page_state, transcript_tb, first_phase_accept_cb],
|
98 |
outputs=navigation_outputs
|
99 |
)
|
100 |
+
|
101 |
next_button.click(
|
102 |
+
fn=go_next_sample_wrapper,
|
103 |
inputs=[current_page_idx_state, current_idx_on_page_state],
|
104 |
outputs=navigation_outputs
|
105 |
+
) # REMOVED the problematic .then() call here
|
|
|
|
|
106 |
|
107 |
prev_button.click(
|
108 |
+
fn=go_prev_sample_wrapper,
|
109 |
inputs=[current_page_idx_state, current_idx_on_page_state],
|
110 |
outputs=navigation_outputs
|
111 |
)
|
|
|
113 |
# Phase 2 actions
|
114 |
approve_button.click(
|
115 |
fn=review_and_next_sample_second_phase,
|
116 |
+
inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("approved")],
|
117 |
outputs=navigation_outputs
|
118 |
)
|
119 |
reject_button.click(
|
120 |
fn=review_and_next_sample_second_phase,
|
121 |
+
inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("rejected")],
|
122 |
outputs=navigation_outputs
|
123 |
)
|
124 |
|
|
|
126 |
trim_button.click(
|
127 |
fn=trim_audio_action,
|
128 |
inputs=[current_page_idx_state, current_idx_on_page_state, trim_start_tb, trim_end_tb],
|
129 |
+
outputs=navigation_outputs
|
130 |
)
|
131 |
undo_trim_button.click(
|
132 |
fn=undo_trim_action,
|
133 |
inputs=[current_page_idx_state, current_idx_on_page_state],
|
134 |
outputs=navigation_outputs
|
135 |
)
|
136 |
+
delete_button.click(
|
137 |
+
fn=confirm_delete_audio_action,
|
138 |
inputs=[current_page_idx_state, current_idx_on_page_state],
|
139 |
outputs=navigation_outputs
|
140 |
)
|
|
|
149 |
fn=export_to_huggingface,
|
150 |
inputs=[hf_repo_name_tb, hf_token_state],
|
151 |
outputs=[hf_export_status_md],
|
152 |
+
queue=True
|
153 |
)
|
154 |
|
155 |
# Launch the interface
|
156 |
if __name__ == "__main__":
    # Report which workflow phase is active before starting the server.
    phase_banner = (
        "==== APPLICATION RUNNING IN SECOND PHASE (REVIEW MODE) ===="
        if SECOND_PHASE
        else "==== APPLICATION RUNNING IN FIRST PHASE (ANNOTATION MODE) ===="
    )
    print(phase_banner)

    # Turn on the request queue, then start the app with debug=True, share=False.
    queued_app = demo.queue()
    queued_app.launch(debug=True, share=False)
|