Spaces:

navidved
/

audio-labelling

Running

App Files Community

navidved committed on 4 days ago

Commit

47eb7ca

verified ·

1 Parent(s): 33d3ab3

Update app.py

Browse files

Files changed (1) hide show

app.py +394 -417

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 import json
 import pandas as pd
 from datasets import load_dataset, DatasetDict, Dataset, Audio
-from huggingface_hub import HfApi, whoami, login, hf_hub_download
 import tempfile
 import shutil
 import gc
@@ -17,39 +17,44 @@ import numpy as np
 from pydantic import BaseModel
 from typing import Optional, List, Tuple
 from datetime import datetime
-import requests # Added for trim_audio_action
 # Log in with Hugging Face token
 token = os.getenv("hf_token")
 if token:
-    login(token)
 else:
-    print("Warning: hf_token environment variable not set. Hugging Face Hub operations might fail.")
 # Configuration
 HF_DATASET_NAME = "navidved/channelb-raw-data"
-AUDIO_DIR = "audio" # Not actively used if paths are absolute or in dataset item
-SAVE_PATH = "annotations.json"
 ALLOWED_USERS = ["shahab7", "amirnamini23", "Mohsen711", "mahya2025", "najmeh00", "sepehr21ar", "zahraemarati", "moghim72", "amin76", "vargha", "navidved"]
 REVIEWERS = ["vargha", "navidved"]
 ANNOTATORS = [user for user in ALLOWED_USERS if user not in REVIEWERS]
 CURRENT_USERNAME = None
 PAGE_SIZE = 100
-SAVE_INTERVAL = 10
 # --- SECOND PHASE CONFIGURATION ---
-SECOND_PHASE = False # Set to True to activate second phase review
-SECOND_PHASE_REVIEW_MAPPING = {} # Populated if SECOND_PHASE is True. Maps: reviewer_username -> original_annotator_username
 # Global state variables
-current_page = 0 # Stores the USER-RELATIVE page index
-# ds_iter = None # No longer maintained globally for streaming robustness
-current_page_data = None # Pandas DataFrame for the current page's data
-audio_backup = {} # For undo_trim, if needed (simplified)
-annotation_count = 0
 unsaved_changes = {}
-total_samples = 0 # Total samples in the HF_DATASET_NAME
-annotator_ranges = {} # Stores {annotator_username: (start_abs_idx, end_abs_idx)}
 # Pydantic data models
 class AudioTrim(BaseModel):
@@ -69,7 +74,7 @@ class Annotation(BaseModel):
     update_at: datetime
 class Sample(BaseModel):
-    id: int # Absolute index in the dataset
     voice_name: str
     original_subtitle: str
     ignore_it: bool = False
@@ -84,89 +89,119 @@ class DatasetModel(BaseModel):
 def load_saved_annotations():
     dataset_model = None
     local_file_loaded_successfully = False
-    # Phase 1: Try to load from local SAVE_PATH
     if os.path.exists(SAVE_PATH):
         try:
             with open(SAVE_PATH, "r", encoding="utf-8") as f:
                 data = json.load(f)
-                # Perform a basic check if data seems like a DatasetModel structure
-                if "samples" in data or not data: # Allow empty dict for initially empty model
-                    dataset_model = DatasetModel(**data)
-                    print(f"Loaded annotations from local JSON file: {SAVE_PATH}")
-                    local_file_loaded_successfully = True
-                else:
-                    print(f"Local JSON file {SAVE_PATH} does not seem to have the correct structure. Will ignore.")
-                    dataset_model = None # Explicitly set to None
         except Exception as e:
-            print(f"Error loading local JSON file '{SAVE_PATH}': {str(e)}. Corrupt file? Will try HF Hub if token available, or create new.")
-            # Optionally, rename corrupt local file to prevent repeated load errors from it
             try:
                 corrupt_path = SAVE_PATH + ".corrupt." + datetime.now().strftime("%Y%m%d%H%M%S%f")
                 os.rename(SAVE_PATH, corrupt_path)
                 print(f"Renamed corrupt local file to {corrupt_path}")
             except OSError as re_e:
                 print(f"Could not rename corrupt local file: {re_e}")
-            dataset_model = None # Ensure it's None if local load failed
-    # Phase 2: If local load failed or file didn't exist, and token is available, try HF Hub
-    # Only attempt HF download if local load was not successful.
     if not local_file_loaded_successfully and token:
-        print("Local annotations not loaded or not found/corrupt. Trying Hugging Face Hub...")
         try:
             hf_path = hf_hub_download(
                 repo_id=HF_DATASET_NAME,
-                filename=os.path.basename(SAVE_PATH), # Use basename in case SAVE_PATH is a full path
                 repo_type="dataset",
                 token=token
             )
             with open(hf_path, "r", encoding="utf-8") as f:
                 data = json.load(f)
-                dataset_model = DatasetModel(**data)
-            # Cache it locally after successful download
             with open(SAVE_PATH, "w", encoding="utf-8") as f_cache:
                 f_cache.write(dataset_model.model_dump_json(exclude_none=True, indent=4))
-            print(f"Loaded annotations from HF dataset repository '{HF_DATASET_NAME}/{os.path.basename(SAVE_PATH)}' and cached locally to '{SAVE_PATH}'.")
-        except Exception as e: # Catches HfHubHTTPError (like 404) and other issues
-            print(f"Error loading JSON file from HF repo '{HF_DATASET_NAME}/{os.path.basename(SAVE_PATH)}': {str(e)}")
-            # If HF load fails, dataset_model remains as it was (None if local also failed/absent)
-    # Phase 3: If still no dataset_model (neither local nor HF Hub worked), create a new empty one
     if dataset_model is None:
         print("No valid annotations found locally or on HF Hub (or failed to load). Creating new empty DatasetModel.")
         dataset_model = DatasetModel(samples=[])
     return dataset_model
-def save_annotations(dataset_model: DatasetModel):
-    global annotation_count
-    try:
-        with open(SAVE_PATH, "w", encoding="utf-8") as f:
-            f.write(dataset_model.model_dump_json(exclude_none=True, indent=4))
-        print(f"Saved annotations to {SAVE_PATH}")
-        annotation_count += 1
-        if annotation_count % SAVE_INTERVAL == 0 and token:
-            push_json_to_hf()
-    except Exception as e:
-        print(f"Error saving annotations: {str(e)}")
 def push_json_to_hf():
     if not token:
-        print("Cannot push to HF: token not available.")
         return
     try:
         api = HfApi()
         api.upload_file(
-            path_or_fileobj=SAVE_PATH,
-            path_in_repo=os.path.basename(SAVE_PATH), # Use basename
             repo_type="dataset",
             repo_id=HF_DATASET_NAME,
-            token=token
         )
-        print(f"Uploaded {os.path.basename(SAVE_PATH)} to Hugging Face repository {HF_DATASET_NAME}")
     except Exception as e:
-        print(f"Error uploading JSON file: {str(e)}")
 def calculate_annotator_ranges(total_samples_val, annotators_list):
     num_annotators = len(annotators_list)
@@ -184,7 +219,7 @@ def calculate_annotator_ranges(total_samples_val, annotators_list):
             end_idx += 1
         if end_idx >= total_samples_val:
             end_idx = total_samples_val -1
-        if start_idx <= end_idx: # Ensure valid range
              ranges[annotator] = (start_idx, end_idx)
         start_idx = end_idx + 1
     print(f"Calculated annotator ranges: {ranges}")
@@ -209,7 +244,7 @@ def initialize_second_phase_assignments():
         SECOND_PHASE_REVIEW_MAPPING[annotator] = annotator
         print(f"Second phase: {annotator} will review their own work.")
     else:
-        for i, reviewer_user in enumerate(ANNOTATORS): # In 2nd phase, ANNOTATORS become reviewers of other ANNOTATORS
             original_annotator_idx = (i - 1 + len(ANNOTATORS)) % len(ANNOTATORS)
             original_annotator_user = ANNOTATORS[original_annotator_idx]
             SECOND_PHASE_REVIEW_MAPPING[reviewer_user] = original_annotator_user
@@ -220,24 +255,31 @@ def initialize_second_phase_assignments():
             print(f"Warning: Original annotator {original_annotator} (being reviewed by {reviewer}) has no range defined in annotator_ranges.")
 def get_user_allowed_range(username):
-    global annotator_ranges, total_samples
     if SECOND_PHASE:
-        if not SECOND_PHASE_REVIEW_MAPPING:
-            initialize_second_phase_assignments()
         original_annotator_to_review = SECOND_PHASE_REVIEW_MAPPING.get(username)
         if original_annotator_to_review:
-            if not annotator_ranges and total_samples > 0:
                  annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
             user_range = annotator_ranges.get(original_annotator_to_review)
             return user_range
-        else:
-            return None
     else: # First Phase Logic
         if get_user_role(username) == "reviewer":
             return (0, total_samples - 1) if total_samples > 0 else None
-        elif username in annotator_ranges:
             return annotator_ranges[username]
         else:
             return None
@@ -255,6 +297,7 @@ def get_dataset_info():
     if total_samples > 0:
         return {'num_samples': total_samples}
     try:
         ds_info_obj = load_dataset(HF_DATASET_NAME, split="train", streaming=False)
         num_samples_val = ds_info_obj.num_rows
         if num_samples_val and num_samples_val > 0:
@@ -262,11 +305,12 @@ def get_dataset_info():
             print(f"Dataset info: total_samples set to {total_samples}")
             return {'num_samples': total_samples}
         else:
-            print("Warning: ds_info_obj.num_rows was not positive. Trying iteration for count (may be slow).")
             ds_stream = load_dataset(HF_DATASET_NAME, split="train", streaming=True)
             count = 0
-            for _ in ds_stream:
                 count +=1
             if count > 0:
                 total_samples = count
                 print(f"Dataset info: total_samples set to {total_samples} by iteration.")
@@ -276,19 +320,20 @@ def get_dataset_info():
                 total_samples = -1
                 return {'num_samples': -1}
     except Exception as e:
-        print(f"Error getting dataset info: {e}")
         total_samples = -1
         return {'num_samples': -1}
-# Initial data load (moved after functions it calls are defined)
-dataset_info = get_dataset_info() # This sets global total_samples
-if total_samples > 0:
-    annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
-    if SECOND_PHASE:
-        initialize_second_phase_assignments()
-else:
-    print("Warning: total_samples is not positive. Annotation ranges and second phase assignments may be incorrect.")
-    annotator_ranges = {}
 def get_audio_path(audio_entry):
     if isinstance(audio_entry, dict):
@@ -307,11 +352,9 @@ def get_audio_path(audio_entry):
         return audio_entry
     return None
-# --- MODIFIED FUNCTION ---
 def load_page_data(page_num_within_user_view=0):
     global current_page_data, current_page
-    # Default to empty DataFrame; current_page updated to reflect attempt
     current_page_data = pd.DataFrame(columns=["audio", "sentence", "id_within_page", "absolute_idx"])
     current_page = page_num_within_user_view
@@ -326,24 +369,18 @@ def load_page_data(page_num_within_user_view=0):
         print(f"User {CURRENT_USERNAME} has an invalid allowed range: {user_allowed_range}")
         return current_page_data
-    # Calculate the global start index for this page based on user's view
     page_global_start_idx = user_start_abs + (page_num_within_user_view * PAGE_SIZE)
     if page_global_start_idx > user_end_abs:
         print(f"Requested page {page_num_within_user_view} (abs start {page_global_start_idx}) is beyond user {CURRENT_USERNAME}'s allowed samples end ({user_end_abs}).")
-        return current_page_data # Return empty DataFrame
-    # Calculate the global end index for this page, capped by user's total assigned samples
     page_global_end_idx = min(page_global_start_idx + PAGE_SIZE - 1, user_end_abs)
-    # Calculate how many samples are actually on this page
     num_samples_on_this_page = page_global_end_idx - page_global_start_idx + 1
     if num_samples_on_this_page <= 0:
-        # This might happen if page_global_start_idx is valid but page_global_end_idx calculation results in non-positive samples
-        # (e.g. page_global_start_idx is exactly user_end_abs + 1 after rounding from a large page_num_within_user_view)
         print(f"No samples for user {CURRENT_USERNAME} on their page {page_num_within_user_view}. Calculated range for page: [{page_global_start_idx}-{page_global_end_idx}]")
-        return current_page_data # Return empty DataFrame
     print(f"Loading page {page_num_within_user_view} for user {CURRENT_USERNAME}. "
           f"Effective absolute dataset range for this page: [{page_global_start_idx}-{page_global_end_idx}] "
@@ -351,29 +388,21 @@ def load_page_data(page_num_within_user_view=0):
           f"Will attempt to load {num_samples_on_this_page} samples.")
     try:
-        ds_full = load_dataset(HF_DATASET_NAME, split="train", streaming=True)
-        # Efficiently skip to the start of the page
         ds_page_specific = ds_full.skip(page_global_start_idx)
-        # Take only the samples needed for this page
         page_iterable = ds_page_specific.take(num_samples_on_this_page)
     except Exception as e:
         print(f"Error loading or processing dataset via skip/take for page data: {e}")
-        # current_page_data is already empty from the top, current_page updated
         return current_page_data
     samples_on_page_list = []
-    # The absolute index for items coming from page_iterable will start at page_global_start_idx
     current_processing_abs_idx = page_global_start_idx
-    # Iterate through the samples for the current page
     for id_on_page_counter, sample_data_item in enumerate(page_iterable):
         sample_data_item['absolute_idx'] = current_processing_abs_idx
-        sample_data_item['id_within_page'] = id_on_page_counter # 0-indexed for the page
         samples_on_page_list.append(sample_data_item)
         current_processing_abs_idx += 1
-        # Safety break: if we've collected enough samples for the page
         if id_on_page_counter + 1 >= num_samples_on_this_page:
             break
@@ -383,17 +412,26 @@ def load_page_data(page_num_within_user_view=0):
               f"First abs_idx: {samples_on_page_list[0]['absolute_idx']}, "
               f"Last abs_idx: {samples_on_page_list[-1]['absolute_idx']}.")
     else:
-        # This case might occur if .take() returns fewer items than expected (e.g., dataset is shorter than total_samples indicated)
         print(f"No samples were loaded for page {page_num_within_user_view} (user: {CURRENT_USERNAME}) "
-              f"despite expecting {num_samples_on_this_page} from range [{page_global_start_idx}-{page_global_end_idx}]. "
-              f"This could mean the source dataset is shorter than anticipated or an issue with streaming/take().")
-        # current_page_data remains empty
     gc.collect()
     return current_page_data
-# --- END OF MODIFIED FUNCTION ---
-# Core functions
 def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_action, accepted_flag=False):
     global current_page_data, unsaved_changes
@@ -411,7 +449,7 @@ def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_
     audio_entry_original = actual_sample_info["audio"]
     voice_name = os.path.basename(str(get_audio_path(audio_entry_original) or f"sample_{absolute_idx}"))
-    dataset_model = load_saved_annotations()
     sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
     if not sample:
@@ -462,7 +500,7 @@ def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_
     if absolute_idx in unsaved_changes:
         del unsaved_changes[absolute_idx]
-    save_annotations(dataset_model)
     return f"✓ Saved annotation for sample {absolute_idx}"
 def handle_second_phase_action(page_idx, idx_on_page, action: str):
@@ -491,7 +529,7 @@ def handle_second_phase_action(page_idx, idx_on_page, action: str):
         print(f"Warning: No prior annotation by {original_annotator_to_review} for sample {absolute_idx}. Creating placeholder for review.")
         annotation_to_review = Annotation(
             annotator=original_annotator_to_review,
-            annotated_subtitle=sample.original_subtitle,
             create_at=datetime.now(),
             update_at=datetime.now()
         )
@@ -505,8 +543,7 @@ def handle_second_phase_action(page_idx, idx_on_page, action: str):
     if action == "approved":
         sample.is_approved_in_second_phase = True
-    # else:
-    #    sample.is_approved_in_second_phase = False
     save_annotations(dataset_model)
     return f"✓ Review ({action}) saved for sample {absolute_idx} (Original annotator: {original_annotator_to_review})"
@@ -515,6 +552,7 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
     global current_page_data, total_samples
     if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
         return None, "", f"Invalid index ({idx_on_page}) for current page data (len {len(current_page_data) if current_page_data is not None else 'None'}).", "unreviewed", "white", True, False, "", gr.update(visible=False)
     actual_sample_info = current_page_data.iloc[idx_on_page]
@@ -523,23 +561,28 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
     audio_entry_original = actual_sample_info["audio"]
     audio_val = get_audio_path(audio_entry_original)
-    default_transcript = actual_sample_info["sentence"]
     transcript_to_display = default_transcript
     ui_reviewer_field = "unreviewed"
     ui_color = "white"
     ui_editable = True
-    ui_is_accepted_flag = False # For 1st phase reviewer checkbox state
     status_prefix = ""
     user_allowed_range = get_user_allowed_range(current_user_displaying)
     if user_allowed_range:
         user_start_abs, user_end_abs = user_allowed_range
-        current_sample_num_in_user_assignment = absolute_idx - user_start_abs + 1
-        total_samples_for_user = user_end_abs - user_start_abs + 1
-        status_prefix = f"Sample {current_sample_num_in_user_assignment} of {total_samples_for_user} for you (Abs Idx {absolute_idx})."
     else:
-        status_prefix = f"Sample (Abs Idx {absolute_idx})."
     dataset_model = load_saved_annotations()
     sample_from_json = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
@@ -566,7 +609,6 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
                 if annotation_under_review:
                     transcript_to_display = annotation_under_review.annotated_subtitle or default_transcript
-                    # ui_is_accepted_flag refers to the 2nd phase review status by *this* reviewer
                     ui_is_accepted_flag = (annotation_under_review.second_phase_review_status == "approved" and
                                           annotation_under_review.second_phase_reviewed_by == current_user_displaying)
@@ -577,7 +619,7 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
                             ui_color = "gray"
                             ui_reviewer_field += f" (Already reviewed by {annotation_under_review.second_phase_reviewed_by} as {annotation_under_review.second_phase_review_status})"
                     else:
-                        ui_color = "yellow" # Pending this user's review
                 else:
                     transcript_to_display = default_transcript
                     ui_reviewer_field += " (No submission by original annotator)"
@@ -590,7 +632,7 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
                 transcript_to_display = accepted_first_phase_annotation.annotated_subtitle or default_transcript
                 ui_reviewer_field = f"Accepted by: {accepted_first_phase_annotation.first_phase_reviewer_username}"
                 ui_color = "green"
-                ui_is_accepted_flag = True # Checkbox state reflects this acceptance
                 ui_editable = (get_user_role(current_user_displaying) == "reviewer")
             else:
                 user_specific_annotation = next((a for a in sample_from_json.annotations or [] if a.annotator == current_user_displaying), None)
@@ -599,25 +641,21 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
                     ui_reviewer_field = f"Your draft (as {user_specific_annotation.annotator})"
                     ui_color = "yellow"
                     ui_editable = True
-                    # ui_is_accepted_flag remains False, as it's user's own draft, not yet "accepted" by a reviewer role
-                else: # No accepted ann, no user-specific ann. Check for other unaccepted annotations.
                     other_annotations = [a for a in sample_from_json.annotations or [] if not a.is_first_phase_accepted]
                     if other_annotations:
                         if get_user_role(current_user_displaying) == "reviewer":
-                            other_ann_to_show = other_annotations[0] # Show first other draft
                             transcript_to_display = other_ann_to_show.annotated_subtitle or default_transcript
                             ui_reviewer_field = f"Draft by: {other_ann_to_show.annotator}"
                             ui_color = "blue"
                             ui_editable = True
-                            # ui_is_accepted_flag remains False (reviewer is seeing other's draft)
-                        else: # Annotator sees original if not their work and not accepted
                             transcript_to_display = default_transcript
                             ui_reviewer_field = f"Labeled by: {other_annotations[0].annotator}"
                             ui_color = "lightblue"
-                            ui_editable = False # Annotator cannot edit another annotator's unreviewed work
-                    # else: default_transcript, "unreviewed", "white", editable=True already set
-    # Override color for unsaved changes visual cue in first phase
     if not SECOND_PHASE and absolute_idx in unsaved_changes:
         ui_color = "pink"
@@ -627,24 +665,24 @@ def get_sample(page_idx_user_relative, idx_on_page, current_user_displaying):
     else:
         ui_status_message += " (Annotation Phase)"
-    # Determine visibility of the "Accept" checkbox (only for reviewers in 1st phase)
     show_accept_checkbox = not SECOND_PHASE and get_user_role(current_user_displaying) == "reviewer"
     return audio_val, transcript_to_display, ui_status_message, ui_reviewer_field, ui_color, ui_editable, ui_is_accepted_flag, default_transcript, gr.update(visible=show_accept_checkbox)
 def load_interface_data(page_idx_user_relative, idx_on_page):
-    audio, text, base_status, saved_reviewer_text, color, editable, accepted_flag, original_dataset_text, accept_cb_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
     return (
-        page_idx_user_relative,
-        idx_on_page,
-        audio,
-        gr.update(value=text, interactive=editable),
-        gr.update(value=saved_reviewer_text, elem_classes=[color]),
-        base_status,
-        original_dataset_text,
-        accept_cb_update, # For the first_phase_accept_cb visibility
-        accepted_flag # For the first_phase_accept_cb value state
     )
 def navigate_sample(page_idx_user_relative, idx_on_page, direction: int):
@@ -653,9 +691,10 @@ def navigate_sample(page_idx_user_relative, idx_on_page, direction: int):
     if current_page_data is None or len(current_page_data) == 0:
         user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
         err_msg = "No data loaded. Try reloading or check your assigned range."
-        if not user_allowed_range or user_allowed_range[0] > user_allowed_range[1]:
             err_msg = "You have no samples assigned or your range is invalid."
         return page_idx_user_relative, idx_on_page, None, gr.update(value="Error", interactive=False), gr.update(value="Error"), err_msg, "", gr.update(visible=False), False
@@ -664,45 +703,50 @@ def navigate_sample(page_idx_user_relative, idx_on_page, direction: int):
     new_idx_on_page = target_idx_on_page
     user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
     if not user_allowed_range:
-        audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
-        return page_idx_user_relative, idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), "Error: No allowed range for navigation.", orig_text, cb_vis_update, acc_flag
-    if target_idx_on_page < 0:
         if page_idx_user_relative > 0:
             new_page_idx_user_relative = page_idx_user_relative - 1
             temp_data = load_page_data(new_page_idx_user_relative)
             if temp_data is not None and not temp_data.empty:
                 new_idx_on_page = len(temp_data) - 1
-            else:
-                audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
-                status = status + " [Already at the first sample of this page/range]"
-                return page_idx_user_relative, idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status, orig_text, cb_vis_update, acc_flag
-        else:
-            audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
-            status = status + " [At the beginning of your assigned samples]"
-            return page_idx_user_relative, idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status, orig_text, cb_vis_update, acc_flag
-    elif target_idx_on_page >= len(current_page_data):
         new_page_idx_user_relative = page_idx_user_relative + 1
         temp_data = load_page_data(new_page_idx_user_relative)
         if temp_data is not None and not temp_data.empty:
             new_idx_on_page = 0
-        else:
             current_abs_idx_check = -1
             if current_page_data is not None and not current_page_data.empty and idx_on_page < len(current_page_data):
                  current_abs_idx_check = current_page_data.iloc[idx_on_page]['absolute_idx']
             is_at_very_end = user_allowed_range and current_abs_idx_check != -1 and current_abs_idx_check >= user_allowed_range[1]
-            audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
             if is_at_very_end:
-                status = status + " [At the end of your assigned samples]"
             else:
-                status = status + " [No more samples in this direction (next page empty or end of assignment)]"
-            return page_idx_user_relative, idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status, orig_text, cb_vis_update, acc_flag
     return load_interface_data(new_page_idx_user_relative, new_idx_on_page)
 def go_next_sample_wrapper(page_idx_user_relative, idx_on_page):
@@ -727,6 +771,11 @@ def review_and_next_sample_second_phase(page_idx_user_relative, idx_on_page, rev
 def jump_to_absolute_idx(target_abs_idx_str, current_page_idx_user_relative, current_idx_on_page):
     global current_page_data
     try:
         target_abs_idx = int(target_abs_idx_str)
@@ -734,63 +783,53 @@ def jump_to_absolute_idx(target_abs_idx_str, current_page_idx_user_relative, cur
         user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
         if not user_allowed_range or not is_within_range(target_abs_idx, user_allowed_range):
-            status_msg = f"Target index {target_abs_idx} is outside your assigned range {user_allowed_range or 'N/A'}."
-            audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
-            return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt, cb_vis_update, acc
         user_start_abs, _ = user_allowed_range
         offset_from_user_start = target_abs_idx - user_start_abs
         if offset_from_user_start < 0:
-            status_msg = f"Logic Error: Target index {target_abs_idx} has negative offset from user start {user_start_abs}."
-            print(status_msg)
-            audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
-            return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt, cb_vis_update, acc
         new_user_relative_page_idx = offset_from_user_start // PAGE_SIZE
         temp_page_data_df = load_page_data(new_user_relative_page_idx)
         if temp_page_data_df is None or temp_page_data_df.empty:
-             status_msg = f"No data found for your page {new_user_relative_page_idx} (containing abs index {target_abs_idx})."
-             audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
-             return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt, cb_vis_update, acc
-        # current_page_data is now updated by load_page_data
-        matching_rows = current_page_data[current_page_data['absolute_idx'] == target_abs_idx]
-        if not matching_rows.empty:
-            new_idx_on_page_actual = matching_rows.iloc[0]['id_within_page']
-        else:
-            # This can happen if target_abs_idx is within the range of the loaded page, but not exactly matching an item
-            # (e.g., if the dataset is shorter than expected and the page didn't fill up to target_abs_idx)
-            # Or, if target_abs_idx is valid but not the first item on the page.
-            # The id_within_page calculation from offset_from_user_start % PAGE_SIZE might be more direct:
-            new_idx_on_page_actual = offset_from_user_start % PAGE_SIZE
-            if new_idx_on_page_actual >= len(current_page_data): # If calculated index is out of bounds for the loaded page
-                status_msg = f"Index {target_abs_idx} on your page {new_user_relative_page_idx} is out of bounds for loaded data. Displaying start of page."
-                print(status_msg)
-                new_idx_on_page_actual = 0
-                if current_page_data.empty: # Should have been caught by temp_page_data_df check
-                     audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
-                     return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg + " (Page empty after load)", orig_txt, cb_vis_update, acc
         return load_interface_data(new_user_relative_page_idx, new_idx_on_page_actual)
     except ValueError:
-        status_msg = "Invalid index format for jump."
     except Exception as e:
         import traceback
-        status_msg = f"Error jumping to index: {str(e)}"
-        print(f"{status_msg}\n{traceback.format_exc()}")
-    # Fallback for errors
-    audio, text, _, rev, color, edit, acc, orig_txt, cb_vis_update = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
-    return current_page_idx_user_relative, current_idx_on_page, audio, gr.update(value=text, interactive=edit), gr.update(value=rev, elem_classes=[color]), status_msg, orig_txt, cb_vis_update, acc
 def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_end_str):
-    # Helper to format returns, ensuring all outputs for navigation_outputs_extended are present
     def _return_current_state_with_message(msg_suffix):
-        page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val, orig_text_val, cb_vis_update, cb_val = load_interface_data(page_idx_user_relative, idx_on_page)
-        return page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val + f" [{msg_suffix}]", orig_text_val, cb_vis_update, cb_val
     if SECOND_PHASE: return _return_current_state_with_message("Trimming disabled in Review Phase.")
@@ -799,36 +838,27 @@ def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_
     actual_sample_info = current_page_data.iloc[idx_on_page]
     absolute_idx = actual_sample_info['absolute_idx']
     original_audio_path_info = get_audio_path(actual_sample_info["audio"])
     source_basename_for_trimmed_file = os.path.basename(str(original_audio_path_info)) if isinstance(original_audio_path_info, str) else f"sample_raw_data_{absolute_idx}"
     audio_seg = None
     temp_dir_for_download = None
     try:
-        if isinstance(original_audio_path_info, tuple): # Raw data (sr, array)
             sr, audio_array = original_audio_path_info
             if not isinstance(audio_array, np.ndarray): return _return_current_state_with_message("Raw audio data is not a numpy array.")
             if audio_array.size == 0: return _return_current_state_with_message("Cannot trim empty audio array.")
-            # Ensure contiguous array and correct channel interpretation (assuming mono or stereo)
             audio_array = np.ascontiguousarray(audio_array)
             channels = 1 if audio_array.ndim == 1 else (audio_array.shape[1] if audio_array.ndim == 2 and audio_array.shape[1] in [1,2] else (audio_array.shape[0] if audio_array.ndim == 2 and audio_array.shape[0] in [1,2] else 0))
             if channels == 0: return _return_current_state_with_message(f"Unsupported audio array shape or channels: {audio_array.shape}")
-            if audio_array.ndim == 2 and audio_array.shape[0] < audio_array.shape[1] and audio_array.shape[0] in [1, 2]: # if channels are rows
-                audio_array = np.ascontiguousarray(audio_array.T) # Transpose to (samples, channels)
-            # Convert to int16 for AudioSegment
             if audio_array.dtype == np.float32 or audio_array.dtype == np.float64: audio_array_int = (audio_array * np.iinfo(np.int16).max).astype(np.int16)
             elif audio_array.dtype == np.int16: audio_array_int = audio_array
-            elif audio_array.dtype == np.int32: audio_array_int = (audio_array >> 16).astype(np.int16) # Approximate if int32
             else: return _return_current_state_with_message(f"Unsupported numpy array dtype for raw audio: {audio_array.dtype}")
             sample_width = audio_array_int.itemsize
             audio_seg = AudioSegment(data=audio_array_int.tobytes(), sample_width=sample_width, frame_rate=sr, channels=channels)
-        elif isinstance(original_audio_path_info, str): # Path string or URL
             audio_to_load = original_audio_path_info
             if not (os.path.exists(audio_to_load) or audio_to_load.startswith("http")): return _return_current_state_with_message("Audio file path is invalid, does not exist, or is not a valid URL.")
             if audio_to_load.startswith("http"):
@@ -841,48 +871,44 @@ def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_
             audio_seg = AudioSegment.from_file(audio_to_load)
         else:
             return _return_current_state_with_message("Trimming not supported for this audio source.")
         if audio_seg is None: return _return_current_state_with_message("Failed to load audio segment.")
         try: start_s, end_s = float(trim_start_str), float(trim_end_str)
         except ValueError: return _return_current_state_with_message("Invalid trim times: Start and End must be numbers.")
         start_ms, end_ms, audio_duration_ms = int(start_s * 1000), int(end_s * 1000), len(audio_seg)
         if not (0 <= start_ms < end_ms and end_ms <= audio_duration_ms):
-             return _return_current_state_with_message(f"Invalid trim times: start={start_s}s, end={end_s}s for audio of {audio_duration_ms/1000.0:.2f}s. Ensure 0 <= start < end <= duration.")
         trimmed_seg = audio_seg[start_ms:end_ms]
         os.makedirs("trimmed_audio", exist_ok=True)
         safe_voice_name = re.sub(r'[^\w.-]', '_', source_basename_for_trimmed_file)
         trimmed_filename = f"trimmed_{absolute_idx}_{safe_voice_name}"
-        if not os.path.splitext(trimmed_filename)[1]: trimmed_filename += ".wav" # Default to wav if no extension
         trimmed_path = os.path.join("trimmed_audio", trimmed_filename)
         export_format = os.path.splitext(trimmed_path)[1][1:].lower() or "wav"
         trimmed_seg.export(trimmed_path, format=export_format)
         dataset_model = load_saved_annotations()
         sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
         if not sample:
             sample = Sample(id=absolute_idx, voice_name=os.path.basename(str(get_audio_path(actual_sample_info["audio"]) or f"sample_{absolute_idx}")),
                             original_subtitle=actual_sample_info["sentence"], annotations=[])
             dataset_model.samples = dataset_model.samples or []
             dataset_model.samples.append(sample)
         now = datetime.now()
         annotation = next((a for a in sample.annotations or [] if a.annotator == CURRENT_USERNAME), None)
         if not annotation:
             annotation = Annotation(annotator=CURRENT_USERNAME, create_at=now, update_at=now)
             sample.annotations = sample.annotations or []
             sample.annotations.append(annotation)
         annotation.audio_trims = [AudioTrim(start=start_s, end=end_s)]
         annotation.update_at = now
         save_annotations(dataset_model)
         # Return full state, but with new audio path and status message
-        page_idx, current_idx, _, transcript_update, reviewer_update, status_val, orig_text_val, cb_vis_update, cb_val = load_interface_data(page_idx_user_relative, idx_on_page)
-        return page_idx, current_idx, trimmed_path, transcript_update, reviewer_update, status_val + " [Trimmed]", orig_text_val, cb_vis_update, cb_val
     except Exception as e:
         import traceback
         print(f"Error during trim_audio_action for abs_idx {absolute_idx}: {str(e)}\n{traceback.format_exc()}")
@@ -893,11 +919,11 @@ def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_
 def undo_trim_action(page_idx_user_relative, idx_on_page):
     def _return_current_state_with_message(msg_suffix):
-        page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val, orig_text_val, cb_vis_update, cb_val = load_interface_data(page_idx_user_relative, idx_on_page)
-        return page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val + f" [{msg_suffix}]", orig_text_val, cb_vis_update, cb_val
     if SECOND_PHASE: return _return_current_state_with_message("Undo Trim disabled in Review Phase.")
     if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
         return _return_current_state_with_message("Audio data not available (page error).")
@@ -910,42 +936,27 @@ def undo_trim_action(page_idx_user_relative, idx_on_page):
             annotation.audio_trims = None
             annotation.update_at = datetime.now()
             save_annotations(dataset_model)
-    # Reload interface to show original audio. get_sample will handle finding the original path.
-    return _return_current_state_with_message("Trim undone")
 def confirm_delete_audio_action(page_idx_user_relative, idx_on_page):
-    def _return_current_state_with_message(msg_suffix): # Not really used here as load_interface_data is called directly
-        page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val, orig_text_val, cb_vis_update, cb_val = load_interface_data(page_idx_user_relative, idx_on_page)
-        return page_idx, current_idx, audio_val, transcript_update, reviewer_update, status_val + f" [{msg_suffix}]", orig_text_val, cb_vis_update, cb_val
-    if SECOND_PHASE:
-        # Instead of custom message, just call load_interface_data which handles phase display
         loaded_data = load_interface_data(page_idx_user_relative, idx_on_page)
-        return (*loaded_data[0:5], loaded_data[5] + " [Delete disabled in Review Phase]", *loaded_data[6:])
     if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
-        loaded_data = load_interface_data(page_idx_user_relative, idx_on_page) # Will show error from get_sample
-        return (*loaded_data[0:5], loaded_data[5] + " [Audio data not available (page error)]", *loaded_data[6:])
     absolute_idx = current_page_data.iloc[idx_on_page]['absolute_idx']
     voice_name_original = os.path.basename(str(get_audio_path(current_page_data.iloc[idx_on_page]["audio"]) or f"sample_{absolute_idx}"))
     dataset_model = load_saved_annotations()
     sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
     if not sample:
-        sample = Sample(
-            id=absolute_idx,
-            voice_name=voice_name_original,
-            original_subtitle=current_page_data.iloc[idx_on_page]["sentence"],
-            annotations=[]
-        )
         dataset_model.samples = dataset_model.samples or []
         dataset_model.samples.append(sample)
     sample.ignore_it = True
     now = datetime.now()
     deleted_text_marker = "AUDIO DELETED (This audio has been removed.)"
@@ -959,12 +970,8 @@ def confirm_delete_audio_action(page_idx_user_relative, idx_on_page):
         sample.annotations = sample.annotations or []
         sample.annotations.append(annotation)
     save_annotations(dataset_model)
-    # After deleting, reload the interface for this item. get_sample will handle the "deleted" display.
-    # load_interface_data returns a 9-tuple
-    return load_interface_data(page_idx_user_relative, idx_on_page)
-# Export functions
 def sanitize_string(s):
     """Return *s* with every character outside a safe set replaced by "_".

     Non-string input is converted with ``str()`` first. Characters that are
     kept: word characters (``\\w``), hyphen, dot and forward slash.
     """
     if not isinstance(s, str):
         s = str(s)
     # The hyphen must be escaped: an unescaped "\w-." inside a character set
     # is parsed as a range that starts at a class escape, which `re` rejects
     # with "bad character range \w-." as soon as the pattern is compiled.
     return re.sub(r'[^\w\-./]', '_', s)
@@ -979,7 +986,7 @@ def push_to_hub_with_retry(dataset_dict, repo_id, private=True, token_val=None):
         print("Cannot push to hub: No token provided for push_to_hub_with_retry.")
         return
     print(f"Pushing dataset to {repo_id}")
-    dataset_dict.push_to_hub(repo_id, private=private, token=token_val)
 def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progress()):
     if not hf_token_for_export:
@@ -993,27 +1000,18 @@ def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progre
         dataset_model_annotations = load_saved_annotations()
-        # Use global total_samples, try to fetch if not set
         current_total_samples = total_samples
         if current_total_samples <= 0:
-            info = get_dataset_info() # This updates global total_samples
             current_total_samples = total_samples
             if current_total_samples <= 0:
                 return "Export failed: Total number of samples is unknown or invalid."
-        ds_source = load_dataset(HF_DATASET_NAME, split="train", streaming=False)
-        # Verify if ds_source.num_rows matches current_total_samples
-        if hasattr(ds_source, 'num_rows') and ds_source.num_rows != current_total_samples:
-            print(f"Warning: Source dataset num_rows ({ds_source.num_rows}) mismatches cached total_samples ({current_total_samples}). Using source num_rows for iteration count if smaller.")
-            # Decide which count to trust or how to reconcile. For now, iterate up to the smaller of the two.
-            # iteration_limit = min(ds_source.num_rows, current_total_samples) if ds_source.num_rows else current_total_samples
-            # For safety, iterate through ds_source and use its length
-            iteration_limit = len(ds_source) # For non-streaming, len() is reliable.
-            if iteration_limit != current_total_samples:
-                 print(f"Adjusting iteration limit for export to {iteration_limit} based on loaded source dataset length.")
-        else:
-            iteration_limit = current_total_samples
         exported_data_list = []
@@ -1023,9 +1021,7 @@ def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progre
         for i, source_sample in enumerate(ds_source):
             if i >= iteration_limit: break
             num_processed_from_source +=1
             absolute_idx = i
             audio_entry = source_sample.get("audio")
             sentence_val = source_sample.get("sentence", "")
             audio_dict_to_export = audio_entry
@@ -1042,201 +1038,178 @@ def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progre
                         approved_anns = [a for a in annotation_data.annotations if a.second_phase_review_status == "approved"]
                         if SECOND_PHASE and approved_anns:
                             best_ann = sorted(approved_anns, key=lambda x: x.second_phase_review_timestamp or datetime.min, reverse=True)[0]
                         if not best_ann:
                             accepted_anns = [a for a in annotation_data.annotations if a.is_first_phase_accepted]
                             best_ann = sorted(accepted_anns, key=lambda x: x.update_at, reverse=True)[0] if accepted_anns else None
                         if not best_ann:
                              best_ann = sorted(annotation_data.annotations, key=lambda x: x.update_at, reverse=True)[0]
                     if best_ann:
                         sentence_val = best_ann.annotated_subtitle if best_ann.annotated_subtitle is not None else sentence_val
-                        if best_ann.audio_trims and audio_dict_to_export and isinstance(audio_dict_to_export, dict) and audio_dict_to_export.get("array") is not None:
-                            # Attempt to load and use trimmed audio if path exists
-                            original_audio_path_for_trim_lookup = get_audio_path(audio_entry) # Get original path/info
                             original_voice_name_for_trim = os.path.basename(str(original_audio_path_for_trim_lookup or f"sample_{absolute_idx}"))
                             safe_voice_name_for_trim = re.sub(r'[^\w.-]', '_', original_voice_name_for_trim)
                             trimmed_fname_base = f"trimmed_{absolute_idx}_{safe_voice_name_for_trim}"
                             potential_trimmed_path = os.path.join("trimmed_audio", trimmed_fname_base + ".wav")
                             if os.path.exists(potential_trimmed_path):
                                 try:
-                                    arr, sr = sf.read(potential_trimmed_path)
-                                    audio_dict_to_export = {"array": arr, "sampling_rate": sr}
                                 except Exception as e_read_trim:
-                                    print(f"Warning: Could not read trimmed audio file {potential_trimmed_path} for sample {absolute_idx}: {e_read_trim}. Exporting original/untrimmed.")
-                            else:
-                                print(f"Warning: Trimmed audio file {potential_trimmed_path} not found for sample {absolute_idx}. Exporting original/untrimmed.")
             exported_data_list.append({
                 "audio": audio_dict_to_export,
                 "sentence": sanitize_sentence(sentence_val)
             })
             if (i + 1) % 100 == 0:
                 progress((i + 1) / iteration_limit, f"Processed {i+1}/{iteration_limit} samples")
             gc.collect()
-        if num_processed_from_source != iteration_limit:
-             print(f"Warning: Processed {num_processed_from_source} from source, but iteration_limit was {iteration_limit}.")
-        if not exported_data_list:
-            return "No data to export after processing."
-        for item in exported_data_list:
-            if item["audio"] is None:
-                 item["audio"] = {"array": np.array([], dtype=np.float32), "sampling_rate": 16000}
-            elif isinstance(item["audio"], dict) and 'path' in item["audio"] and item["audio"]['path'] is None:
-                 item["audio"] = {"array": np.array([], dtype=np.float32), "sampling_rate": 16000}
         try:
             final_dataset = Dataset.from_list(exported_data_list)
             final_dataset = final_dataset.cast_column("audio", Audio(sampling_rate=16000))
         except Exception as e_cast:
             print(f"Error during Dataset.from_list or cast_column: {e_cast}")
             for idx_problem, problematic_item in enumerate(exported_data_list[:5]):
-                print(f"Sample item {idx_problem} for export: Audio type {type(problematic_item['audio'])}, Audio content: {str(problematic_item['audio'])[:200]}")
-            return f"Export failed during data conversion: {e_cast}. Check audio data formats."
         dataset_dict_export = DatasetDict({"train": final_dataset})
         progress(0.95, "Uploading to Hugging Face...")
-        # Ensure repo_name_str is just the dataset name part for constructing target_repo_id
-        dataset_name_part = repo_name_str.split('/')[-1]
-        target_repo_id = f"{whoami(token=hf_token_for_export)['name']}/{dataset_name_part}"
-        push_to_hub_with_retry(
-            dataset_dict=dataset_dict_export,
-            repo_id=target_repo_id,
-            private=True,
-            token_val=hf_token_for_export
-        )
         end_time = time.time()
         print(f"Upload done, total time: {end_time - start_time:.2f}s")
         progress(1.0, "Upload complete!")
         return f"Exported to huggingface.co/datasets/{target_repo_id}"
     except Exception as e:
         import traceback
         error_msg = f"Export failed: {str(e)}"
         print(f"{error_msg}\n{traceback.format_exc()}")
         return error_msg
-def hf_login(hf_token_val):
-    global CURRENT_USERNAME, token, current_page_data, total_samples, annotator_ranges, SECOND_PHASE_REVIEW_MAPPING
     failed_login_transcript_update = gr.update(value="", interactive=False)
-    # Define the full tuple structure for failure returns to ensure consistency
-    # page_idx, idx_on_page, audio, transcript_tb(interactive), reviewer_tb, status, original_transcript, cb_vis, cb_val
-    # This corresponds to navigation_outputs_extended (9 items)
-    # Login returns 19 items, so we need to map this carefully.
-    # Outputs for login_button (19 outputs)
-    # login_container, main_container, reviewer_tb (val), hf_token_state, login_message,
-    # save_next_button(vis), transcript_tb(update), trim_button(vis), undo_trim_button(vis), delete_button(vis),
-    # first_phase_accept_cb(update with vis & val),
-    # approve_button(vis), reject_button(vis),
-    # current_page_idx_state(val), current_idx_on_page_state(val), audio_player(val),
-    # transcript_tb(another update, often same as above for value/interactive),
-    # status_md(val), original_transcript_state(val)
     def _failed_login_outputs(login_msg_text, reviewer_text_val="N/A"):
         return (
             gr.update(visible=True), gr.update(visible=False),  # login_container, main_container
-            gr.update(value=reviewer_text_val), hf_token_val, login_msg_text,  # reviewer_tb, hf_token_state, login_message
-            gr.update(visible=False),  # save_next_button
-            failed_login_transcript_update, # transcript_tb (interactive part)
-            gr.update(visible=False),  # trim_button
-            gr.update(visible=False),  # undo_trim_button
-            gr.update(visible=False),  # delete_button
             gr.update(visible=False, value=False),  # first_phase_accept_cb (vis & val)
-            gr.update(visible=False),  # approve_button
-            gr.update(visible=False),  # reject_button
-            0, 0, None,  # page_idx_state, idx_on_page_state, audio_player
-            failed_login_transcript_update, # transcript_tb (value part)
             login_msg_text if "failed" in login_msg_text.lower() or "error" in login_msg_text.lower() else "Please log in.", # status_md
             "" # original_transcript_state
         )
-    if not hf_token_val:
         return _failed_login_outputs("Login failed: Token cannot be empty.")
     try:
-        user_info = whoami(token=hf_token_val)
         username = user_info['name']
         if username in ALLOWED_USERS:
             CURRENT_USERNAME = username
-            token = hf_token_val # Update global token if login is successful
-            # Crucial: Re-fetch dataset info and ranges AFTER successful login,
-            # as these might depend on user context or be freshly needed.
-            ds_info = get_dataset_info() # This updates global total_samples
             if total_samples <= 0:
-                return _failed_login_outputs(f"Login OK for {CURRENT_USERNAME}, but failed to get dataset size. Cannot proceed.", reviewer_text_val="Error")
-            # Recalculate ranges and assignments based on potentially new total_samples
             annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
             if SECOND_PHASE:
-                # Re-initialize second phase assignments as total_samples or ANNOTATORS might have been reset/updated
-                SECOND_PHASE_REVIEW_MAPPING.clear() # Clear previous mapping before re-initializing
-                initialize_second_phase_assignments()
             user_allowed_range_check = get_user_allowed_range(CURRENT_USERNAME)
             if not user_allowed_range_check or user_allowed_range_check[0] > user_allowed_range_check[1]:
-                 return _failed_login_outputs(f"Login OK for {CURRENT_USERNAME}, but no samples assigned for {'review' if SECOND_PHASE else 'annotation'}.", reviewer_text_val="No Samples")
-            # Load page 0 data for the logged-in user
-            current_page_data = load_page_data(0) # This sets global current_page_data and current_page
-            is_second_phase_active = SECOND_PHASE
-            # Load interface data for the first sample (page 0, index 0)
-            # load_interface_data returns a 9-tuple:
-            # (page_idx, idx_on_page, audio, transcript_update, reviewer_update, status, orig_text, cb_vis_update, cb_val)
-            initial_load_tuple = load_interface_data(current_page, 0 if not current_page_data.empty else 0)
             # Structure for login_outputs (19 items)
             return (
-                gr.update(visible=False),  # login_container
-                gr.update(visible=True),   # main_container
-                initial_load_tuple[4],     # reviewer_tb (gr.update obj)
-                hf_token_val,              # hf_token_state (value)
-                f"Login successful! Welcome {CURRENT_USERNAME}. Phase: {'Review' if is_second_phase_active else 'Annotation'}.", # login_message
-                gr.update(visible=not is_second_phase_active),  # save_next_button (visibility)
-                initial_load_tuple[3],     # transcript_tb (gr.update obj for value and interactivity)
-                gr.update(visible=not is_second_phase_active),  # trim_button (visibility)
-                gr.update(visible=not is_second_phase_active),  # undo_trim_button (visibility)
-                gr.update(visible=not is_second_phase_active),  # delete_button (visibility)
-                gr.update(visible=initial_load_tuple[7]['visible'], value=initial_load_tuple[8]), # first_phase_accept_cb (gr.update obj for vis and val)
-                gr.update(visible=is_second_phase_active),  # approve_button (visibility)
-                gr.update(visible=is_second_phase_active),  # reject_button (visibility)
-                initial_load_tuple[0],     # current_page_idx_state (value)
-                initial_load_tuple[1],     # current_idx_on_page_state (value)
-                initial_load_tuple[2],     # audio_player (value or gr.update obj)
-                initial_load_tuple[3],     # transcript_tb (duplicate for Gradio's multiple output handling if needed, maps to same component)
-                initial_load_tuple[5],     # status_md (value)
-                initial_load_tuple[6]      # original_transcript_state (value)
             )
-        else: # User not in ALLOWED_USERS
             CURRENT_USERNAME = None
-            return _failed_login_outputs("User not authorized!", reviewer_text_val="Unauthorized")
     except Exception as e:
         CURRENT_USERNAME = None
         import traceback
         login_err_msg = f"Login failed: {str(e)}"
         print(f"{login_err_msg}\n{traceback.format_exc()}")
         return _failed_login_outputs(login_err_msg, reviewer_text_val="Login Error")
-# Gradio Interface
 css = """
 .white { background-color: white; color: black; } .yellow { background-color: yellow; color: black; }
 .blue { background-color: lightblue; color: black; } .green { background-color: lightgreen; color: black; }
@@ -1246,14 +1219,19 @@ css = """
 .reviewer-textbox input { text-align: center; font-weight: bold; }
 """
 with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
-    hf_token_state = gr.State(token) # Store token if passed via env
     current_page_idx_state = gr.State(0)
     current_idx_on_page_state = gr.State(0)
     original_transcript_state = gr.State("")
     with gr.Column(visible=True, elem_id="login_container") as login_container:
         gr.Markdown("## HF Authentication")
-        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password", value=token or "") # Pre-fill if env var set
         login_button = gr.Button("Login")
         login_message = gr.Markdown("")
@@ -1291,30 +1269,33 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
                 jump_text_tb = gr.Textbox(label="Jump to Global Index", placeholder="Enter dataset absolute index")
                 jump_button = gr.Button("Jump")
             with gr.Row():
-                default_repo_name = f"your-username/asr-dataset" # Will be updated on login if user context known
-                hf_repo_name_tb = gr.Textbox(label="Export Repository Name (username/dataset-name)", value=default_repo_name)
                 hf_export_button = gr.Button("Export to Hugging Face", variant="primary")
             hf_export_status_md = gr.Markdown("")
     # Outputs for login_button (19 outputs)
     login_outputs = [
-        login_container, main_container, reviewer_tb, hf_token_state, login_message,
-        save_next_button, transcript_tb, trim_button, undo_trim_button, delete_button,
-        first_phase_accept_cb,
-        approve_button, reject_button,
-        current_page_idx_state, current_idx_on_page_state, audio_player, transcript_tb,
-        status_md, original_transcript_state
     ]
     login_button.click(fn=hf_login, inputs=[hf_token_input], outputs=login_outputs)
-    # Common outputs for navigation and actions that reload sample view (9 outputs)
-    # page_idx_state, idx_on_page_state, audio_player, transcript_tb (update), reviewer_tb (update),
-    # status_md, original_transcript_state, first_phase_accept_cb (update for vis), first_phase_accept_cb (update for val)
     navigation_outputs_extended = [
-        current_page_idx_state, current_idx_on_page_state,
-        audio_player, transcript_tb, reviewer_tb, status_md, original_transcript_state,
-        first_phase_accept_cb, # For visibility update from get_sample via load_interface_data
-        first_phase_accept_cb  # For value update from get_sample via load_interface_data
     ]
     save_next_button.click(
@@ -1332,7 +1313,6 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
         inputs=[current_page_idx_state, current_idx_on_page_state],
         outputs=navigation_outputs_extended
     )
     approve_button.click(
         fn=review_and_next_sample_second_phase,
         inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("approved")],
@@ -1343,7 +1323,6 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
         inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("rejected")],
         outputs=navigation_outputs_extended
     )
     trim_button.click(
         fn=trim_audio_action,
         inputs=[current_page_idx_state, current_idx_on_page_state, trim_start_tb, trim_end_tb],
@@ -1357,9 +1336,8 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
     delete_button.click(
         fn=confirm_delete_audio_action,
         inputs=[current_page_idx_state, current_idx_on_page_state],
-        outputs=navigation_outputs_extended # confirm_delete now returns the 9-tuple
     )
     jump_button.click(
         fn=jump_to_absolute_idx,
         inputs=[jump_text_tb, current_page_idx_state, current_idx_on_page_state],
@@ -1367,33 +1345,32 @@ with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
     )
     hf_export_button.click(
         fn=export_to_huggingface,
-        inputs=[hf_repo_name_tb, hf_token_state],
         outputs=[hf_export_status_md],
         queue=True
     )
 if __name__ == "__main__":
-    # Global config overrides for testing can be placed here if needed
-    # Example: SECOND_PHASE = True
-    # Initializations based on configuration (run once at start)
-    if total_samples <= 0: # If get_dataset_info() failed or wasn't run effectively before this
-        print("Main block: total_samples not positive. Attempting to get dataset info again.")
-        dataset_info = get_dataset_info() # Sets global total_samples
-        if total_samples > 0:
-            annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
-        else:
-            print("Main block: Still no total_samples. Ranges will be empty, app might not function fully.")
     if SECOND_PHASE:
         print("==== APPLICATION LAUNCHING IN SECOND PHASE (REVIEW MODE) ====")
-        if not SECOND_PHASE_REVIEW_MAPPING and total_samples > 0 and ANNOTATORS:
-            print("Main block: Initializing second phase assignments...")
-            if not annotator_ranges: # Should be populated if total_samples is known
-                 annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
-            initialize_second_phase_assignments()
-        elif not SECOND_PHASE_REVIEW_MAPPING:
-            print("Warning (Main block): Second phase active, but review mapping is empty. Check total_samples and ANNOTATORS list.")
     else:
         print("==== APPLICATION LAUNCHING IN FIRST PHASE (ANNOTATION MODE) ====")

 import json
 import pandas as pd
 from datasets import load_dataset, DatasetDict, Dataset, Audio
+from huggingface_hub import HfApi, whoami, login, hf_hub_download, HfHubHTTPError
 import tempfile
 import shutil
 import gc
 from pydantic import BaseModel
 from typing import Optional, List, Tuple
 from datetime import datetime
+import requests
 # Log in to the Hugging Face Hub at import time, using the token read from the
 # `hf_token` environment variable (lowercase name, exactly as queried below).
 token = os.getenv("hf_token")
 if token:
+    try:
+        login(token)
+        print("Successfully logged in using hf_token environment variable.")
+    except Exception as e:
+        print(f"Failed to login with hf_token environment variable: {e}")
         # Clearing the module-level token makes a failed login look the same as
         # a missing token to the truthiness checks done elsewhere in this file.
+        token = None # Ensure token is None if login fails
 else:
+    print("Warning: hf_token environment variable not set. Hugging Face Hub operations might fail unless token is provided via UI.")
 # Configuration
 HF_DATASET_NAME = "navidved/channelb-raw-data"
+AUDIO_DIR = "audio"
+SAVE_PATH = "annotations.json" # Local filename for annotations
 ALLOWED_USERS = ["shahab7", "amirnamini23", "Mohsen711", "mahya2025", "najmeh00", "sepehr21ar", "zahraemarati", "moghim72", "amin76", "vargha", "navidved"]
 REVIEWERS = ["vargha", "navidved"]
 ANNOTATORS = [user for user in ALLOWED_USERS if user not in REVIEWERS]
 CURRENT_USERNAME = None
 PAGE_SIZE = 100
+# SAVE_INTERVAL = 1 # FOR DEBUGGING: PUSH ON EVERY SAVE
+SAVE_INTERVAL = 10 # Normal operation: push every 10 saves
 # --- SECOND PHASE CONFIGURATION ---
+SECOND_PHASE = False
+SECOND_PHASE_REVIEW_MAPPING = {}
 # Global state variables
+current_page = 0
+current_page_data = None
+audio_backup = {}
+annotation_count = 0 # Counts saves since login for the current session
 unsaved_changes = {}
+total_samples = 0
+annotator_ranges = {}
 # Pydantic data models
 class AudioTrim(BaseModel):
     update_at: datetime
 class Sample(BaseModel):
+    id: int
     voice_name: str
     original_subtitle: str
     ignore_it: bool = False
 def load_saved_annotations():
     # Return a DatasetModel holding the saved annotations. Sources are tried in
     # this order:
     #   1. the local JSON file at SAVE_PATH;
     #   2. the same-named file in the HF dataset repo HF_DATASET_NAME (only
     #      attempted when the module-level `token` is truthy);
     #   3. a new DatasetModel with an empty `samples` list.
     # Load failures in steps 1 and 2 are printed rather than raised.
     dataset_model = None
     local_file_loaded_successfully = False
+    annotations_filename_in_repo = os.path.basename(SAVE_PATH) # e.g., "annotations.json"
     if os.path.exists(SAVE_PATH):
         try:
             with open(SAVE_PATH, "r", encoding="utf-8") as f:
                 data = json.load(f)
             # Accept a payload that has a "samples" key, or an empty payload.
             # Anything else is reported and ignored; note that in that case the
             # file is left in place (it is not renamed like a corrupt file).
+            if "samples" in data or not data:
+                dataset_model = DatasetModel(**data)
+                print(f"Loaded annotations from local JSON file: {SAVE_PATH}")
+                local_file_loaded_successfully = True
+            else:
+                print(f"Local JSON file {SAVE_PATH} has incorrect structure. Ignoring.")
         except Exception as e:
             # Reached for any failure above: opening, JSON parsing, or building
             # the DatasetModel. The file is moved aside under a timestamped name.
+            print(f"Error loading local JSON file '{SAVE_PATH}': {str(e)}. Will try HF Hub or create new.")
             try:
                 corrupt_path = SAVE_PATH + ".corrupt." + datetime.now().strftime("%Y%m%d%H%M%S%f")
                 os.rename(SAVE_PATH, corrupt_path)
                 print(f"Renamed corrupt local file to {corrupt_path}")
             except OSError as re_e:
                 print(f"Could not rename corrupt local file: {re_e}")
     # `token` is only read in this function, never assigned.
+    global token # Access the global token, which should be set by hf_login
     if not local_file_loaded_successfully and token:
+        print(f"Local annotations not loaded or not found/corrupt. Trying Hugging Face Hub for {annotations_filename_in_repo}...")
         try:
             hf_path = hf_hub_download(
                 repo_id=HF_DATASET_NAME,
+                filename=annotations_filename_in_repo,
                 repo_type="dataset",
                 token=token
             )
             with open(hf_path, "r", encoding="utf-8") as f:
                 data = json.load(f)
+            dataset_model = DatasetModel(**data)
             # Cache the Hub copy at SAVE_PATH, re-serialized from the model.
             with open(SAVE_PATH, "w", encoding="utf-8") as f_cache:
                 f_cache.write(dataset_model.model_dump_json(exclude_none=True, indent=4))
+            print(f"Loaded annotations from HF '{HF_DATASET_NAME}/{annotations_filename_in_repo}' and cached to '{SAVE_PATH}'.")
+        except HfHubHTTPError as e:
             # A 404 is reported as an expected "not pushed yet" situation; other
             # HTTP errors are reported as errors. Neither is re-raised.
             # NOTE(review): assumes e.response is always set on this error type - confirm.
+            if e.response.status_code == 404:
+                print(f"Annotations file '{annotations_filename_in_repo}' not found on HF repo '{HF_DATASET_NAME}'. This is normal if it's the first run or not pushed yet.")
+            else:
+                print(f"Error loading JSON file from HF repo '{HF_DATASET_NAME}/{annotations_filename_in_repo}': {str(e)}")
+        except Exception as e:
+            print(f"Unexpected error loading JSON file from HF repo '{HF_DATASET_NAME}/{annotations_filename_in_repo}': {str(e)}")
     if dataset_model is None:
         print("No valid annotations found locally or on HF Hub (or failed to load). Creating new empty DatasetModel.")
         dataset_model = DatasetModel(samples=[])
     return dataset_model
 def push_json_to_hf():
     # Upload the local annotations file (SAVE_PATH) to the HF dataset repo
     # HF_DATASET_NAME, stored under the file's basename. Nothing is returned;
     # progress and failures are reported through print only.
+    global token # Use the globally set token from hf_login
+    annotations_filename_in_repo = os.path.basename(SAVE_PATH)
     # Without a token there is nothing to authenticate with, so do nothing.
     if not token:
+        print("Push to HF: Aborted. Token not available/set.")
+        return
+    print(f"Push to HF: Attempting to upload '{SAVE_PATH}' as '{annotations_filename_in_repo}' to '{HF_DATASET_NAME}'.")
     # Check the token with whoami() first; if that call fails for any reason the
     # upload is not attempted.
+    try:
+        user_details = whoami(token=token)
+        print(f"Push to HF: Token confirmed for user '{user_details.get('name')}'.")
+    except Exception as e_whoami:
+        print(f"Push to HF: Token seems invalid or whoami failed. Error: {e_whoami}")
+        print(f"Push to HF: Aborting upload due to token validation issue.")
         return
     try:
         api = HfApi()
         api.upload_file(
+            path_or_fileobj=SAVE_PATH, # Local path to the file
+            path_in_repo=annotations_filename_in_repo, # Name of the file in the repository
             repo_type="dataset",
             repo_id=HF_DATASET_NAME,
+            token=token,
+            commit_message=f"Updated {annotations_filename_in_repo} via annotation tool at {datetime.now().isoformat()}"
         )
+        print(f"Push to HF: Successfully uploaded '{annotations_filename_in_repo}' to Hugging Face repository '{HF_DATASET_NAME}'.")
+    except Exception as e:
         # Upload failures are printed with a traceback and not re-raised, so a
         # failed push does not propagate to the caller.
+        print(f"Push to HF: Error uploading '{annotations_filename_in_repo}' to '{HF_DATASET_NAME}'. Error: {str(e)}")
+        import traceback
+        print("Push to HF: Traceback below:")
+        traceback.print_exc()
+def save_annotations(dataset_model: DatasetModel):
     # Write `dataset_model` as JSON to SAVE_PATH and, when a token is available,
     # call push_json_to_hf() on every SAVE_INTERVAL-th successful local save.
     # Nothing is returned; any exception raised here is printed with a traceback
     # and not re-raised.
+    global annotation_count, token # Make sure we're using the global token
+    # DEBUGGING PRINT
+    print(f"Debug (save_annotations): annotation_count (before inc)={annotation_count}, SAVE_INTERVAL={SAVE_INTERVAL}, token_is_truthy={bool(token)}")
+    try:
+        with open(SAVE_PATH, "w", encoding="utf-8") as f:
+            f.write(dataset_model.model_dump_json(exclude_none=True, indent=4))
+        print(f"Saved annotations locally to {SAVE_PATH}")
         # The counter only advances when the local write above did not raise.
+        annotation_count += 1 # Increment after successful local save
+        if token and (annotation_count % SAVE_INTERVAL == 0):
+            print(f"Debug (save_annotations): Conditions met for HF push. Current annotation_count={annotation_count}.")
+            push_json_to_hf()
+        elif not token:
+            print(f"Debug (save_annotations): HF push skipped. Token is not available. annotation_count={annotation_count}.")
+        else: # Token is available, but interval not met
             # This branch is only reached when annotation_count % SAVE_INTERVAL
             # is non-zero, so the outer "% SAVE_INTERVAL" below never changes the
             # printed value here.
+            print(f"Debug (save_annotations): HF push skipped. Interval not met. annotation_count={annotation_count}. "
+                  f"Need {(SAVE_INTERVAL - (annotation_count % SAVE_INTERVAL)) % SAVE_INTERVAL} more saves for next push (or 0 if at interval).")
     except Exception as e:
+        print(f"Error in save_annotations (local save or triggering push): {str(e)}")
+        import traceback
+        print("Traceback for save_annotations error:")
+        traceback.print_exc()
 def calculate_annotator_ranges(total_samples_val, annotators_list):
     num_annotators = len(annotators_list)
             end_idx += 1
         if end_idx >= total_samples_val:
             end_idx = total_samples_val -1
+        if start_idx <= end_idx:
              ranges[annotator] = (start_idx, end_idx)
         start_idx = end_idx + 1
     print(f"Calculated annotator ranges: {ranges}")
         SECOND_PHASE_REVIEW_MAPPING[annotator] = annotator
         print(f"Second phase: {annotator} will review their own work.")
     else:
+        for i, reviewer_user in enumerate(ANNOTATORS):
             original_annotator_idx = (i - 1 + len(ANNOTATORS)) % len(ANNOTATORS)
             original_annotator_user = ANNOTATORS[original_annotator_idx]
             SECOND_PHASE_REVIEW_MAPPING[reviewer_user] = original_annotator_user
             print(f"Warning: Original annotator {original_annotator} (being reviewed by {reviewer}) has no range defined in annotator_ranges.")
 def get_user_allowed_range(username):
     # Return the range of absolute sample indices `username` may work on, or
     # None when no range applies.
     #   - Second phase: the annotator_ranges entry of the annotator that
     #     SECOND_PHASE_REVIEW_MAPPING assigns to `username` (None if the user is
     #     not in the mapping, or if that annotator has no entry).
     #   - First phase: (0, total_samples - 1) for users whose role is
     #     "reviewer" (None if total_samples is not positive); otherwise the
     #     user's own annotator_ranges entry, or None.
     # Side effect: may populate the globals annotator_ranges and, via
     # initialize_second_phase_assignments(), the review mapping.
+    global annotator_ranges, total_samples, ANNOTATORS # Ensure ANNOTATORS is accessible
     if SECOND_PHASE:
+        if not SECOND_PHASE_REVIEW_MAPPING: # If empty, try to initialize
+            # Need annotator_ranges for initialize_second_phase_assignments
+            if not annotator_ranges and total_samples > 0 and ANNOTATORS:
+                annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
+            initialize_second_phase_assignments() # This will populate SECOND_PHASE_REVIEW_MAPPING
         original_annotator_to_review = SECOND_PHASE_REVIEW_MAPPING.get(username)
         if original_annotator_to_review:
+            # Ensure annotator_ranges is populated if it wasn't before
+            if not annotator_ranges and total_samples > 0 and ANNOTATORS:
                  annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
             # .get() yields None when the reviewed annotator has no range entry.
             user_range = annotator_ranges.get(original_annotator_to_review)
             return user_range
+        else: # User not found in review mapping (e.g., a first-phase reviewer not part of ANNOTATORS cycle)
+            return None # Or handle as appropriate, e.g., full range if they are a super-reviewer
     else: # First Phase Logic
         if get_user_role(username) == "reviewer":
             return (0, total_samples - 1) if total_samples > 0 else None
+        # Ensure annotator_ranges is populated for annotators
+        elif not annotator_ranges and total_samples > 0 and ANNOTATORS:
+            annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
         # Only non-reviewers reach this lookup; reviewers returned above.
+        if username in annotator_ranges:
             return annotator_ranges[username]
         else:
             return None
     if total_samples > 0:
         return {'num_samples': total_samples}
     try:
+        print(f"Attempting to load dataset info for {HF_DATASET_NAME} (non-streaming)...")
         ds_info_obj = load_dataset(HF_DATASET_NAME, split="train", streaming=False)
         num_samples_val = ds_info_obj.num_rows
         if num_samples_val and num_samples_val > 0:
             print(f"Dataset info: total_samples set to {total_samples}")
             return {'num_samples': total_samples}
         else:
+            print(f"Warning: ds_info_obj.num_rows was not positive ({num_samples_val}). Trying iteration for count (may be slow).")
             ds_stream = load_dataset(HF_DATASET_NAME, split="train", streaming=True)
             count = 0
+            for _ in ds_stream: # This will iterate over the whole dataset if num_rows is wrong
                 count +=1
+                if count % 10000 == 0: print(f"Counting by iteration... at {count}") # Progress for large datasets
             if count > 0:
                 total_samples = count
                 print(f"Dataset info: total_samples set to {total_samples} by iteration.")
                 total_samples = -1
                 return {'num_samples': -1}
     except Exception as e:
+        print(f"Error getting dataset info for {HF_DATASET_NAME}: {e}")
         total_samples = -1
         return {'num_samples': -1}
+# Initial data load attempt (will be re-attempted more robustly in hf_login)
+# dataset_info = get_dataset_info()
+# if total_samples > 0:
+#     annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
+#     if SECOND_PHASE:
+#         initialize_second_phase_assignments()
+# else:
+#     print("Initial check: total_samples is not positive. Will rely on login process to set this.")
+#     annotator_ranges = {}
 def get_audio_path(audio_entry):
     if isinstance(audio_entry, dict):
         return audio_entry
     return None
 def load_page_data(page_num_within_user_view=0):
     global current_page_data, current_page
     current_page_data = pd.DataFrame(columns=["audio", "sentence", "id_within_page", "absolute_idx"])
     current_page = page_num_within_user_view
         print(f"User {CURRENT_USERNAME} has an invalid allowed range: {user_allowed_range}")
         return current_page_data
     page_global_start_idx = user_start_abs + (page_num_within_user_view * PAGE_SIZE)
     if page_global_start_idx > user_end_abs:
         print(f"Requested page {page_num_within_user_view} (abs start {page_global_start_idx}) is beyond user {CURRENT_USERNAME}'s allowed samples end ({user_end_abs}).")
+        return current_page_data
     page_global_end_idx = min(page_global_start_idx + PAGE_SIZE - 1, user_end_abs)
     num_samples_on_this_page = page_global_end_idx - page_global_start_idx + 1
     if num_samples_on_this_page <= 0:
         print(f"No samples for user {CURRENT_USERNAME} on their page {page_num_within_user_view}. Calculated range for page: [{page_global_start_idx}-{page_global_end_idx}]")
+        return current_page_data
     print(f"Loading page {page_num_within_user_view} for user {CURRENT_USERNAME}. "
           f"Effective absolute dataset range for this page: [{page_global_start_idx}-{page_global_end_idx}] "
           f"Will attempt to load {num_samples_on_this_page} samples.")
     try:
+        ds_full = load_dataset(HF_DATASET_NAME, split="train", streaming=True, token=token if token else None) # Use token for private datasets
         ds_page_specific = ds_full.skip(page_global_start_idx)
         page_iterable = ds_page_specific.take(num_samples_on_this_page)
     except Exception as e:
         print(f"Error loading or processing dataset via skip/take for page data: {e}")
         return current_page_data
     samples_on_page_list = []
     current_processing_abs_idx = page_global_start_idx
     for id_on_page_counter, sample_data_item in enumerate(page_iterable):
         sample_data_item['absolute_idx'] = current_processing_abs_idx
+        sample_data_item['id_within_page'] = id_on_page_counter
         samples_on_page_list.append(sample_data_item)
         current_processing_abs_idx += 1
         if id_on_page_counter + 1 >= num_samples_on_this_page:
             break
               f"First abs_idx: {samples_on_page_list[0]['absolute_idx']}, "
               f"Last abs_idx: {samples_on_page_list[-1]['absolute_idx']}.")
     else:
         print(f"No samples were loaded for page {page_num_within_user_view} (user: {CURRENT_USERNAME}) "
+              f"despite expecting {num_samples_on_this_page} from range [{page_global_start_idx}-{page_global_end_idx}]. ")
     gc.collect()
     return current_page_data
+# Core functions (save_sample_data, handle_second_phase_action, get_sample, load_interface_data, navigation functions, jump, trim, export etc. remain largely the same as your previous version)
+# ... (Keep the rest of your functions from the previous version here)
+# For brevity, I'm omitting the bulk of the functions that were not directly related to the HF save issue or initial loading.
+# Make sure to include:
+# - save_sample_data
+# - handle_second_phase_action
+# - get_sample
+# - load_interface_data
+# - navigate_sample and its wrappers
+# - jump_to_absolute_idx
+# - trim_audio_action, undo_trim_action, confirm_delete_audio_action
+# - export_to_huggingface
+# - hf_login (ensure it correctly calls get_dataset_info, calculate_annotator_ranges, load_page_data, etc. *after* successful auth)
 def save_sample_data(page_idx, idx_on_page, transcript, current_user_performing_action, accepted_flag=False):
     global current_page_data, unsaved_changes
     audio_entry_original = actual_sample_info["audio"]
     voice_name = os.path.basename(str(get_audio_path(audio_entry_original) or f"sample_{absolute_idx}"))
+    dataset_model = load_saved_annotations() # This will load existing or create new
     sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
     if not sample:
     if absolute_idx in unsaved_changes:
         del unsaved_changes[absolute_idx]
+    save_annotations(dataset_model) # This will save locally and potentially push to HF
     return f"✓ Saved annotation for sample {absolute_idx}"
 def handle_second_phase_action(page_idx, idx_on_page, action: str):
         print(f"Warning: No prior annotation by {original_annotator_to_review} for sample {absolute_idx}. Creating placeholder for review.")
         annotation_to_review = Annotation(
             annotator=original_annotator_to_review,
+            annotated_subtitle=sample.original_subtitle, # Or actual_sample_info["sentence"]
             create_at=datetime.now(),
             update_at=datetime.now()
         )
     if action == "approved":
         sample.is_approved_in_second_phase = True
+    # else: sample.is_approved_in_second_phase = False # Explicitly set to False on rejection
     save_annotations(dataset_model)
     return f"✓ Review ({action}) saved for sample {absolute_idx} (Original annotator: {original_annotator_to_review})"
     global current_page_data, total_samples
     if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
+        # Default empty values for all expected return items
         return None, "", f"Invalid index ({idx_on_page}) for current page data (len {len(current_page_data) if current_page_data is not None else 'None'}).", "unreviewed", "white", True, False, "", gr.update(visible=False)
     actual_sample_info = current_page_data.iloc[idx_on_page]
     audio_entry_original = actual_sample_info["audio"]
     audio_val = get_audio_path(audio_entry_original)
+    default_transcript = actual_sample_info.get("sentence", "") # Use .get for safety
     transcript_to_display = default_transcript
     ui_reviewer_field = "unreviewed"
     ui_color = "white"
     ui_editable = True
+    ui_is_accepted_flag = False
     status_prefix = ""
     user_allowed_range = get_user_allowed_range(current_user_displaying)
     if user_allowed_range:
         user_start_abs, user_end_abs = user_allowed_range
+        # Ensure user_start_abs is valid before calculation
+        if user_start_abs is not None and absolute_idx >= user_start_abs :
+            current_sample_num_in_user_assignment = absolute_idx - user_start_abs + 1
+            total_samples_for_user = user_end_abs - user_start_abs + 1
+            status_prefix = f"Sample {current_sample_num_in_user_assignment} of {total_samples_for_user} for you (Abs Idx {absolute_idx})."
+        else: # Fallback if range is odd or absolute_idx is somehow outside
+            status_prefix = f"Sample (Abs Idx {absolute_idx}). Range issue for user stats."
     else:
+        status_prefix = f"Sample (Abs Idx {absolute_idx}). No range assigned."
     dataset_model = load_saved_annotations()
     sample_from_json = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
                 if annotation_under_review:
                     transcript_to_display = annotation_under_review.annotated_subtitle or default_transcript
                     ui_is_accepted_flag = (annotation_under_review.second_phase_review_status == "approved" and
                                           annotation_under_review.second_phase_reviewed_by == current_user_displaying)
                             ui_color = "gray"
                             ui_reviewer_field += f" (Already reviewed by {annotation_under_review.second_phase_reviewed_by} as {annotation_under_review.second_phase_review_status})"
                     else:
+                        ui_color = "yellow"
                 else:
                     transcript_to_display = default_transcript
                     ui_reviewer_field += " (No submission by original annotator)"
                 transcript_to_display = accepted_first_phase_annotation.annotated_subtitle or default_transcript
                 ui_reviewer_field = f"Accepted by: {accepted_first_phase_annotation.first_phase_reviewer_username}"
                 ui_color = "green"
+                ui_is_accepted_flag = True
                 ui_editable = (get_user_role(current_user_displaying) == "reviewer")
             else:
                 user_specific_annotation = next((a for a in sample_from_json.annotations or [] if a.annotator == current_user_displaying), None)
                     ui_reviewer_field = f"Your draft (as {user_specific_annotation.annotator})"
                     ui_color = "yellow"
                     ui_editable = True
+                else:
                     other_annotations = [a for a in sample_from_json.annotations or [] if not a.is_first_phase_accepted]
                     if other_annotations:
                         if get_user_role(current_user_displaying) == "reviewer":
+                            other_ann_to_show = other_annotations[0]
                             transcript_to_display = other_ann_to_show.annotated_subtitle or default_transcript
                             ui_reviewer_field = f"Draft by: {other_ann_to_show.annotator}"
                             ui_color = "blue"
                             ui_editable = True
+                        else:
                             transcript_to_display = default_transcript
                             ui_reviewer_field = f"Labeled by: {other_annotations[0].annotator}"
                             ui_color = "lightblue"
+                            ui_editable = False
     if not SECOND_PHASE and absolute_idx in unsaved_changes:
         ui_color = "pink"
     else:
         ui_status_message += " (Annotation Phase)"
     show_accept_checkbox = not SECOND_PHASE and get_user_role(current_user_displaying) == "reviewer"
     return audio_val, transcript_to_display, ui_status_message, ui_reviewer_field, ui_color, ui_editable, ui_is_accepted_flag, default_transcript, gr.update(visible=show_accept_checkbox)
 def load_interface_data(page_idx_user_relative, idx_on_page):
     # Fetch the sample at (page_idx_user_relative, idx_on_page) for
     # CURRENT_USERNAME via get_sample() and repack its 9 values into the
     # 9-tuple returned below. The two index arguments are passed through
     # unchanged as the first two items. The numbered trailing comments name the
     # UI component each position is intended for.
+    # get_sample returns 9 items
+    audio, text, base_status, saved_reviewer_text, color, editable, accepted_flag, original_dataset_text, accept_cb_visibility_update = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
     return (
+        page_idx_user_relative, # 0
+        idx_on_page,            # 1
+        audio,                  # 2
+        gr.update(value=text, interactive=editable), # 3 transcript_tb
+        gr.update(value=saved_reviewer_text, elem_classes=[color]), # 4 reviewer_tb
+        base_status,            # 5 status_md
+        original_dataset_text,  # 6 original_transcript_state
+        accept_cb_visibility_update, # 7 first_phase_accept_cb (visibility part)
+        accepted_flag           # 8 first_phase_accept_cb (value part)
     )
 def navigate_sample(page_idx_user_relative, idx_on_page, direction: int):
     if current_page_data is None or len(current_page_data) == 0:
         user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
         err_msg = "No data loaded. Try reloading or check your assigned range."
+        if not user_allowed_range or (user_allowed_range[0] > user_allowed_range[1]): # check for invalid range
             err_msg = "You have no samples assigned or your range is invalid."
+        # Return a 9-tuple consistent with load_interface_data's structure
         return page_idx_user_relative, idx_on_page, None, gr.update(value="Error", interactive=False), gr.update(value="Error"), err_msg, "", gr.update(visible=False), False
     new_idx_on_page = target_idx_on_page
     user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
+    # This check should ideally not be hit if current_page_data exists, but good safeguard
     if not user_allowed_range:
+        # Use get_sample to fetch current state with an error message
+        current_state = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
+        # current_state is a 9-tuple: (audio, text, status, rev, color, edit, acc_flag, orig_text, cb_vis_update)
+        return page_idx_user_relative, idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), "Error: No allowed range for navigation.", current_state[7], current_state[8], current_state[6]
+    if target_idx_on_page < 0: # Moving to previous page or beginning of assignment
         if page_idx_user_relative > 0:
             new_page_idx_user_relative = page_idx_user_relative - 1
             temp_data = load_page_data(new_page_idx_user_relative)
             if temp_data is not None and not temp_data.empty:
                 new_idx_on_page = len(temp_data) - 1
+            else: # Previous page is empty (shouldn't happen if logic is correct)
+                current_state = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
+                status = current_state[2] + " [Already at the first sample of this page/range]"
+                return page_idx_user_relative, idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), status, current_state[7], current_state[8], current_state[6]
+        else: # Already on first item of first user-relative page
+            current_state = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
+            status = current_state[2] + " [At the beginning of your assigned samples]"
+            return page_idx_user_relative, idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), status, current_state[7], current_state[8], current_state[6]
+    elif target_idx_on_page >= len(current_page_data): # Moving to next page or end of assignment
         new_page_idx_user_relative = page_idx_user_relative + 1
         temp_data = load_page_data(new_page_idx_user_relative)
         if temp_data is not None and not temp_data.empty:
             new_idx_on_page = 0
+        else: # Next user-relative page is empty (means we are at the end of user's allowed samples)
             current_abs_idx_check = -1
             if current_page_data is not None and not current_page_data.empty and idx_on_page < len(current_page_data):
                  current_abs_idx_check = current_page_data.iloc[idx_on_page]['absolute_idx']
             is_at_very_end = user_allowed_range and current_abs_idx_check != -1 and current_abs_idx_check >= user_allowed_range[1]
+            current_state = get_sample(page_idx_user_relative, idx_on_page, CURRENT_USERNAME)
+            status = current_state[2]
             if is_at_very_end:
+                status += " [At the end of your assigned samples]"
             else:
+                status += " [No more samples in this direction (next page empty or end of assignment)]"
+            return page_idx_user_relative, idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), status, current_state[7], current_state[8], current_state[6]
+    # If navigation is within the current page or to a new valid page/index
     return load_interface_data(new_page_idx_user_relative, new_idx_on_page)
 def go_next_sample_wrapper(page_idx_user_relative, idx_on_page):
 def jump_to_absolute_idx(target_abs_idx_str, current_page_idx_user_relative, current_idx_on_page):
     # Jump to the sample whose absolute dataset index is given (as text) in
     # `target_abs_idx_str`, provided it lies inside CURRENT_USERNAME's allowed
     # range. Returns a 9-tuple in the same order load_interface_data() uses.
     # On any failure the tuple is built by _fallback_return() from the incoming
     # page/index arguments, with an explanatory suffix appended to the status.
     global current_page_data
+    # Fallback return using current state if jump fails
+    def _fallback_return(status_message_suffix=""):
         # Re-reads the sample at the caller-supplied indices and reorders
         # get_sample()'s fields (audio, text, status, reviewer text, color,
         # editable, accepted flag, original text, checkbox update) into the
         # 9-tuple layout used by load_interface_data().
         # NOTE(review): when this runs after load_page_data() has been called
         # below, the global page data may already be the newly loaded (possibly
         # empty) page rather than the page the user was on - confirm.
+        current_state = get_sample(current_page_idx_user_relative, current_idx_on_page, CURRENT_USERNAME)
+        status = current_state[2] + status_message_suffix
+        return current_page_idx_user_relative, current_idx_on_page, current_state[0], gr.update(value=current_state[1], interactive=current_state[5]), gr.update(value=current_state[3], elem_classes=[current_state[4]]), status, current_state[7], current_state[8], current_state[6]
     try:
         # int() raising ValueError is handled by the dedicated except below.
         target_abs_idx = int(target_abs_idx_str)
         user_allowed_range = get_user_allowed_range(CURRENT_USERNAME)
         if not user_allowed_range or not is_within_range(target_abs_idx, user_allowed_range):
+            return _fallback_return(f" [Target index {target_abs_idx} is outside your assigned range {user_allowed_range or 'N/A'}.]")
         user_start_abs, _ = user_allowed_range
         offset_from_user_start = target_abs_idx - user_start_abs
         # Defensive check; presumably unreachable once is_within_range() has
         # passed - verify against that helper.
         if offset_from_user_start < 0:
+            return _fallback_return(f" [Logic Error: Target index {target_abs_idx} has negative offset from user start {user_start_abs}.]")
         # Pages are counted relative to the start of the user's own range.
         new_user_relative_page_idx = offset_from_user_start // PAGE_SIZE
+        # load_page_data updates global current_page_data and current_page
         temp_page_data_df = load_page_data(new_user_relative_page_idx)
         if temp_page_data_df is None or temp_page_data_df.empty:
+             return _fallback_return(f" [No data found for your page {new_user_relative_page_idx} (containing abs index {target_abs_idx})].")
+        # Calculate new_idx_on_page based on the target_abs_idx relative to the start of the loaded page
+        # The loaded page (current_page_data) now starts at `user_start_abs + new_user_relative_page_idx * PAGE_SIZE`
+        page_actual_start_abs = current_page_data.iloc[0]['absolute_idx'] if not current_page_data.empty else -1
+        if page_actual_start_abs == -1: # Should not happen if temp_page_data_df was not empty
+            return _fallback_return(f" [Error: Page {new_user_relative_page_idx} loaded empty unexpectedly.]")
+        new_idx_on_page_actual = target_abs_idx - page_actual_start_abs
+        if not (0 <= new_idx_on_page_actual < len(current_page_data)):
+            # This means target_abs_idx was in the user's range for this page, but the page didn't actually contain it
+            # (e.g. dataset ended prematurely within this page's expected span)
+            # Default to first item on the successfully loaded (but perhaps shorter) page.
+            print(f"Warning: Target index {target_abs_idx} resulted in out-of-bounds id_on_page ({new_idx_on_page_actual}) for loaded page. Defaulting to 0.")
+            new_idx_on_page_actual = 0
+            if current_page_data.empty: # Should be caught above
+                 return _fallback_return(f" [Page {new_user_relative_page_idx} is empty after load attempt for jump.]")
         return load_interface_data(new_user_relative_page_idx, new_idx_on_page_actual)
     except ValueError:
+        return _fallback_return(" [Invalid index format for jump.]")
     except Exception as e:
         # Any other failure is logged with a traceback and surfaced in the
         # status text instead of being raised to the UI layer.
         import traceback
+        print(f"Error jumping to index: {str(e)}\n{traceback.format_exc()}")
+        return _fallback_return(f" [Error jumping to index: {str(e)}]")
 def trim_audio_action(page_idx_user_relative, idx_on_page, trim_start_str, trim_end_str):
     def _return_current_state_with_message(msg_suffix):
+        loaded_data = load_interface_data(page_idx_user_relative, idx_on_page)
+        return (*loaded_data[0:5], loaded_data[5] + f" [{msg_suffix}]", *loaded_data[6:])
     if SECOND_PHASE: return _return_current_state_with_message("Trimming disabled in Review Phase.")
     actual_sample_info = current_page_data.iloc[idx_on_page]
     absolute_idx = actual_sample_info['absolute_idx']
     original_audio_path_info = get_audio_path(actual_sample_info["audio"])
     source_basename_for_trimmed_file = os.path.basename(str(original_audio_path_info)) if isinstance(original_audio_path_info, str) else f"sample_raw_data_{absolute_idx}"
     audio_seg = None
     temp_dir_for_download = None
     try:
+        if isinstance(original_audio_path_info, tuple):
             sr, audio_array = original_audio_path_info
             if not isinstance(audio_array, np.ndarray): return _return_current_state_with_message("Raw audio data is not a numpy array.")
             if audio_array.size == 0: return _return_current_state_with_message("Cannot trim empty audio array.")
             audio_array = np.ascontiguousarray(audio_array)
             channels = 1 if audio_array.ndim == 1 else (audio_array.shape[1] if audio_array.ndim == 2 and audio_array.shape[1] in [1,2] else (audio_array.shape[0] if audio_array.ndim == 2 and audio_array.shape[0] in [1,2] else 0))
             if channels == 0: return _return_current_state_with_message(f"Unsupported audio array shape or channels: {audio_array.shape}")
+            if audio_array.ndim == 2 and audio_array.shape[0] < audio_array.shape[1] and audio_array.shape[0] in [1, 2]: audio_array = np.ascontiguousarray(audio_array.T)
             if audio_array.dtype == np.float32 or audio_array.dtype == np.float64: audio_array_int = (audio_array * np.iinfo(np.int16).max).astype(np.int16)
             elif audio_array.dtype == np.int16: audio_array_int = audio_array
+            elif audio_array.dtype == np.int32: audio_array_int = (audio_array >> 16).astype(np.int16)
             else: return _return_current_state_with_message(f"Unsupported numpy array dtype for raw audio: {audio_array.dtype}")
             sample_width = audio_array_int.itemsize
             audio_seg = AudioSegment(data=audio_array_int.tobytes(), sample_width=sample_width, frame_rate=sr, channels=channels)
+        elif isinstance(original_audio_path_info, str):
             audio_to_load = original_audio_path_info
             if not (os.path.exists(audio_to_load) or audio_to_load.startswith("http")): return _return_current_state_with_message("Audio file path is invalid, does not exist, or is not a valid URL.")
             if audio_to_load.startswith("http"):
             audio_seg = AudioSegment.from_file(audio_to_load)
         else:
             return _return_current_state_with_message("Trimming not supported for this audio source.")
         if audio_seg is None: return _return_current_state_with_message("Failed to load audio segment.")
         try: start_s, end_s = float(trim_start_str), float(trim_end_str)
         except ValueError: return _return_current_state_with_message("Invalid trim times: Start and End must be numbers.")
         start_ms, end_ms, audio_duration_ms = int(start_s * 1000), int(end_s * 1000), len(audio_seg)
         if not (0 <= start_ms < end_ms and end_ms <= audio_duration_ms):
+             return _return_current_state_with_message(f"Invalid trim times: start={start_s}s, end={end_s}s for audio of {audio_duration_ms/1000.0:.2f}s.")
         trimmed_seg = audio_seg[start_ms:end_ms]
         os.makedirs("trimmed_audio", exist_ok=True)
         safe_voice_name = re.sub(r'[^\w.-]', '_', source_basename_for_trimmed_file)
         trimmed_filename = f"trimmed_{absolute_idx}_{safe_voice_name}"
+        if not os.path.splitext(trimmed_filename)[1]: trimmed_filename += ".wav"
         trimmed_path = os.path.join("trimmed_audio", trimmed_filename)
         export_format = os.path.splitext(trimmed_path)[1][1:].lower() or "wav"
         trimmed_seg.export(trimmed_path, format=export_format)
         dataset_model = load_saved_annotations()
         sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
         if not sample:
             sample = Sample(id=absolute_idx, voice_name=os.path.basename(str(get_audio_path(actual_sample_info["audio"]) or f"sample_{absolute_idx}")),
                             original_subtitle=actual_sample_info["sentence"], annotations=[])
             dataset_model.samples = dataset_model.samples or []
             dataset_model.samples.append(sample)
         now = datetime.now()
         annotation = next((a for a in sample.annotations or [] if a.annotator == CURRENT_USERNAME), None)
         if not annotation:
             annotation = Annotation(annotator=CURRENT_USERNAME, create_at=now, update_at=now)
             sample.annotations = sample.annotations or []
             sample.annotations.append(annotation)
         annotation.audio_trims = [AudioTrim(start=start_s, end=end_s)]
         annotation.update_at = now
         save_annotations(dataset_model)
         # Return full state, but with new audio path and status message
+        loaded_data_after_trim = load_interface_data(page_idx_user_relative, idx_on_page)
+        # The audio path needs to be overridden here to show the trimmed path
+        return (loaded_data_after_trim[0], loaded_data_after_trim[1], trimmed_path,
+                loaded_data_after_trim[3], loaded_data_after_trim[4],
+                loaded_data_after_trim[5] + " [Trimmed]",
+                *loaded_data_after_trim[6:])
     except Exception as e:
         import traceback
         print(f"Error during trim_audio_action for abs_idx {absolute_idx}: {str(e)}\n{traceback.format_exc()}")
 def undo_trim_action(page_idx_user_relative, idx_on_page):
     def _return_current_state_with_message(msg_suffix):
+        return load_interface_data(page_idx_user_relative, idx_on_page)[0:5] + \
+               (load_interface_data(page_idx_user_relative, idx_on_page)[5] + f" [{msg_suffix}]",) + \
+               load_interface_data(page_idx_user_relative, idx_on_page)[6:]
     if SECOND_PHASE: return _return_current_state_with_message("Undo Trim disabled in Review Phase.")
     if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
         return _return_current_state_with_message("Audio data not available (page error).")
             annotation.audio_trims = None
             annotation.update_at = datetime.now()
             save_annotations(dataset_model)
+    return _return_current_state_with_message("Trim undone") # Reloads UI showing original audio
 def confirm_delete_audio_action(page_idx_user_relative, idx_on_page):
+    def _return_current_state_with_message(msg_suffix=""): # Default to no suffix if just reloading
         loaded_data = load_interface_data(page_idx_user_relative, idx_on_page)
+        return (*loaded_data[0:5], loaded_data[5] + f" [{msg_suffix}]" if msg_suffix else loaded_data[5], *loaded_data[6:])
+    if SECOND_PHASE:
+        return _return_current_state_with_message("Delete disabled in Review Phase.")
     if current_page_data is None or idx_on_page < 0 or idx_on_page >= len(current_page_data):
+        return _return_current_state_with_message("Audio data not available (page error).")
     absolute_idx = current_page_data.iloc[idx_on_page]['absolute_idx']
     voice_name_original = os.path.basename(str(get_audio_path(current_page_data.iloc[idx_on_page]["audio"]) or f"sample_{absolute_idx}"))
     dataset_model = load_saved_annotations()
     sample = next((s for s in dataset_model.samples or [] if s.id == absolute_idx), None)
     if not sample:
+        sample = Sample(id=absolute_idx, voice_name=voice_name_original,
+                        original_subtitle=current_page_data.iloc[idx_on_page]["sentence"], annotations=[])
         dataset_model.samples = dataset_model.samples or []
         dataset_model.samples.append(sample)
     sample.ignore_it = True
     now = datetime.now()
     deleted_text_marker = "AUDIO DELETED (This audio has been removed.)"
         sample.annotations = sample.annotations or []
         sample.annotations.append(annotation)
     save_annotations(dataset_model)
+    return _return_current_state_with_message() # Reload interface to show deleted status
 def sanitize_string(s):
     """Return ``s`` with every character outside the allowed set replaced by ``_``.

     Allowed characters are word characters (letters, digits, underscore),
     ``.``, ``/`` and ``-``.  Non-string input is converted with ``str()`` first.
     """
     if not isinstance(s, str):
         s = str(s)
     # Keep the hyphen last in the class so it is a literal. The previous
     # pattern placed it between \w and '.', which `re` rejects with
     # "bad character range", so every call raised re.error.
     return re.sub(r'[^\w./-]', '_', s)
         print("Cannot push to hub: No token provided for push_to_hub_with_retry.")
         return
     print(f"Pushing dataset to {repo_id}")
+    dataset_dict.push_to_hub(repo_id, private=private, token=token_val) # Make sure this token has write access
 def export_to_huggingface(repo_name_str, hf_token_for_export, progress=gr.Progress()):
     if not hf_token_for_export:
         dataset_model_annotations = load_saved_annotations()
         current_total_samples = total_samples
         if current_total_samples <= 0:
+            info = get_dataset_info()
             current_total_samples = total_samples
             if current_total_samples <= 0:
                 return "Export failed: Total number of samples is unknown or invalid."
+        ds_source = load_dataset(HF_DATASET_NAME, split="train", streaming=False, token=hf_token_for_export) # Use token for private source
+        iteration_limit = len(ds_source)
+        if iteration_limit != current_total_samples:
+             print(f"Warning: Source dataset length ({iteration_limit}) mismatches cached total_samples ({current_total_samples}). Using source length for export.")
         exported_data_list = []
         for i, source_sample in enumerate(ds_source):
             if i >= iteration_limit: break
             num_processed_from_source +=1
             absolute_idx = i
             audio_entry = source_sample.get("audio")
             sentence_val = source_sample.get("sentence", "")
             audio_dict_to_export = audio_entry
                         approved_anns = [a for a in annotation_data.annotations if a.second_phase_review_status == "approved"]
                         if SECOND_PHASE and approved_anns:
                             best_ann = sorted(approved_anns, key=lambda x: x.second_phase_review_timestamp or datetime.min, reverse=True)[0]
                         if not best_ann:
                             accepted_anns = [a for a in annotation_data.annotations if a.is_first_phase_accepted]
                             best_ann = sorted(accepted_anns, key=lambda x: x.update_at, reverse=True)[0] if accepted_anns else None
                         if not best_ann:
                              best_ann = sorted(annotation_data.annotations, key=lambda x: x.update_at, reverse=True)[0]
                     if best_ann:
                         sentence_val = best_ann.annotated_subtitle if best_ann.annotated_subtitle is not None else sentence_val
+                        if best_ann.audio_trims and audio_dict_to_export:
+                            original_audio_path_for_trim_lookup = get_audio_path(audio_entry)
                             original_voice_name_for_trim = os.path.basename(str(original_audio_path_for_trim_lookup or f"sample_{absolute_idx}"))
                             safe_voice_name_for_trim = re.sub(r'[^\w.-]', '_', original_voice_name_for_trim)
                             trimmed_fname_base = f"trimmed_{absolute_idx}_{safe_voice_name_for_trim}"
                             potential_trimmed_path = os.path.join("trimmed_audio", trimmed_fname_base + ".wav")
                             if os.path.exists(potential_trimmed_path):
                                 try:
+                                    arr, sr_trim = sf.read(potential_trimmed_path) # Renamed sr to sr_trim
+                                    audio_dict_to_export = {"array": arr, "sampling_rate": sr_trim}
                                 except Exception as e_read_trim:
+                                    print(f"Warning: Could not read trimmed audio file {potential_trimmed_path} for sample {absolute_idx}: {e_read_trim}.")
+                            # else: # Keep original audio_dict_to_export
             exported_data_list.append({
                 "audio": audio_dict_to_export,
                 "sentence": sanitize_sentence(sentence_val)
             })
             if (i + 1) % 100 == 0:
                 progress((i + 1) / iteration_limit, f"Processed {i+1}/{iteration_limit} samples")
             gc.collect()
+        if not exported_data_list: return "No data to export after processing."
+        for item in exported_data_list: # Ensure audio format before creating Dataset
+            audio_item = item["audio"]
+            if audio_item is None or (isinstance(audio_item, dict) and audio_item.get('path') is None and audio_item.get('array') is None):
+                 item["audio"] = {"array": np.array([], dtype=np.float32), "sampling_rate": 16000} # Placeholder for missing/deleted
         try:
             final_dataset = Dataset.from_list(exported_data_list)
+            # Cast audio, ensure all items have 'array' and 'sampling_rate' or valid 'path'
             final_dataset = final_dataset.cast_column("audio", Audio(sampling_rate=16000))
         except Exception as e_cast:
             print(f"Error during Dataset.from_list or cast_column: {e_cast}")
             for idx_problem, problematic_item in enumerate(exported_data_list[:5]):
+                print(f"Sample item {idx_problem} for export: Audio type {type(problematic_item['audio'])}, Content: {str(problematic_item['audio'])[:200]}")
+            return f"Export failed during data conversion: {e_cast}."
         dataset_dict_export = DatasetDict({"train": final_dataset})
         progress(0.95, "Uploading to Hugging Face...")
+        try:
+            current_hf_user = whoami(token=hf_token_for_export)['name']
+        except Exception as e_whoami_export:
+            return f"Export failed: Could not verify Hugging Face user with provided token: {e_whoami_export}"
+        dataset_name_part = repo_name_str.split('/')[-1] # Get 'my-annotated-dataset' from 'user/my-annotated-dataset'
+        target_repo_id = f"{current_hf_user}/{dataset_name_part}"
+        push_to_hub_with_retry(dataset_dict=dataset_dict_export, repo_id=target_repo_id, private=True, token_val=hf_token_for_export)
         end_time = time.time()
         print(f"Upload done, total time: {end_time - start_time:.2f}s")
         progress(1.0, "Upload complete!")
         return f"Exported to huggingface.co/datasets/{target_repo_id}"
     except Exception as e:
         import traceback
         error_msg = f"Export failed: {str(e)}"
         print(f"{error_msg}\n{traceback.format_exc()}")
         return error_msg
+def hf_login(hf_token_val_ui):
     # Authenticate a user from the token typed into the login form and build
     # the initial UI state.
     #
     # Args:
     #     hf_token_val_ui: Hugging Face token string taken from the login textbox.
     #
     # Returns:
     #     A 19-tuple positionally matched to `login_outputs`: container
     #     visibilities, reviewer textbox, token state, login message, button
     #     visibilities, page/index state, audio, transcript, status and the
     #     original transcript. Every failure path returns the tuple built by
     #     `_failed_login_outputs`, which keeps the login container visible and
     #     the main container hidden.
     #
     # Side effects:
     #     Assigns the module globals CURRENT_USERNAME, token, total_samples,
     #     annotator_ranges, current_page_data and annotation_count.
     #
     # Exceptions raised inside the try block are caught, printed with a
     # traceback and reported to the caller as a failed login.
+    global CURRENT_USERNAME, token, current_page_data, total_samples, annotator_ranges, SECOND_PHASE_REVIEW_MAPPING, annotation_count
+    # Reset session-specific annotation count on new login
+    annotation_count = 0
+    # Default state for UI elements on login failure or before successful load
     failed_login_transcript_update = gr.update(value="", interactive=False)
     def _failed_login_outputs(login_msg_text, reviewer_text_val="N/A"):
+        # This function constructs the 19-tuple for login outputs
         return (
             gr.update(visible=True), gr.update(visible=False),  # login_container, main_container
             # NOTE(review): the submitted token is written back to hf_token_state
             # even though the login did not succeed.
+            gr.update(value=reviewer_text_val), hf_token_val_ui, login_msg_text, # reviewer_tb, hf_token_state, login_message
+            gr.update(visible=False), failed_login_transcript_update, # save_next_button, transcript_tb (interactive)
+            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), # trim, undo_trim, delete buttons
             gr.update(visible=False, value=False),  # first_phase_accept_cb (vis & val)
+            gr.update(visible=False), gr.update(visible=False),  # approve_button, reject_button
+            0, 0, None, failed_login_transcript_update, # page_idx, idx_on_page, audio, transcript_tb (value)
             # Only messages containing "failed" or "error" are echoed to the
             # status area; any other message falls back to "Please log in.".
             login_msg_text if "failed" in login_msg_text.lower() or "error" in login_msg_text.lower() else "Please log in.", # status_md
             "" # original_transcript_state
         )
+    if not hf_token_val_ui:
         return _failed_login_outputs("Login failed: Token cannot be empty.")
     try:
+        print(f"Attempting login with token from UI...")
+        user_info = whoami(token=hf_token_val_ui)
         username = user_info['name']
+        print(f"whoami successful for user: {username}")
         if username in ALLOWED_USERS:
             CURRENT_USERNAME = username
+            token = hf_token_val_ui # IMPORTANT: Set the global token to the one provided in UI
+            print(f"User '{CURRENT_USERNAME}' is in ALLOWED_USERS. Global token updated.")
+            # Crucial: Fetch dataset info and ranges AFTER successful login & token set
+            # Reset total_samples to ensure it's re-fetched with the new token if necessary
+            total_samples = 0
             # NOTE(review): ds_info is never read; presumably get_dataset_info()
             # refreshes the global total_samples as a side effect -- confirm.
+            ds_info = get_dataset_info()
             if total_samples <= 0:
+                return _failed_login_outputs(f"Login OK for {CURRENT_USERNAME}, but failed to get dataset size. Cannot proceed.", reviewer_text_val="Error: No Dataset Size")
             annotator_ranges = calculate_annotator_ranges(total_samples, ANNOTATORS)
             if SECOND_PHASE:
+                # SECOND_PHASE_REVIEW_MAPPING.clear() # Clear previous mapping
+                initialize_second_phase_assignments() # This uses global annotator_ranges
             # A missing range, or one whose start is past its end, means this
             # user has nothing to work on in the current phase.
             user_allowed_range_check = get_user_allowed_range(CURRENT_USERNAME)
             if not user_allowed_range_check or user_allowed_range_check[0] > user_allowed_range_check[1]:
+                 return _failed_login_outputs(f"Login OK for {CURRENT_USERNAME}, but no samples assigned for {'review' if SECOND_PHASE else 'annotation'}.", reviewer_text_val="No Samples Assigned")
+            current_page_data = load_page_data(0) # page_num_within_user_view = 0
+            # Check if page loading actually got data
+            initial_idx_on_page = 0
+            if current_page_data is None or current_page_data.empty:
+                print(f"Warning: Initial page load for user {CURRENT_USERNAME} resulted in no data.")
+                # Attempt to load interface with (0,0) but expect "no data" messages from get_sample
+                initial_idx_on_page = 0 # or handle as error if no data at all is critical
+            # load_interface_data returns a 9-tuple
             # current_page is read here as a module global; presumably the
             # load_page_data(0) call above has just set it -- confirm.
+            initial_load_tuple = load_interface_data(current_page, initial_idx_on_page)
+            is_second_phase_active = SECOND_PHASE
             # Structure for login_outputs (19 items)
             return (
+                gr.update(visible=False),  # 0 login_container
+                gr.update(visible=True),   # 1 main_container
+                initial_load_tuple[4],     # 2 reviewer_tb (gr.update obj from load_interface_data)
+                hf_token_val_ui,           # 3 hf_token_state (value) -> updates the gr.State
+                f"Login successful! Welcome {CURRENT_USERNAME}. Phase: {'Review' if is_second_phase_active else 'Annotation'}.", # 4 login_message
+                gr.update(visible=not is_second_phase_active),  # 5 save_next_button (visibility)
+                initial_load_tuple[3],     # 6 transcript_tb (gr.update obj for value and interactivity)
+                gr.update(visible=not is_second_phase_active),  # 7 trim_button (visibility)
+                gr.update(visible=not is_second_phase_active),  # 8 undo_trim_button (visibility)
+                gr.update(visible=not is_second_phase_active),  # 9 delete_button (visibility)
                 # assumes element 7 supports ['visible'] lookup, i.e. that
                 # gr.update(...) returns a dict-like object -- TODO confirm for
                 # the installed Gradio version.
+                gr.update(visible=initial_load_tuple[7]['visible'], value=initial_load_tuple[8]), # 10 first_phase_accept_cb (vis from [7], val from [8])
+                gr.update(visible=is_second_phase_active),  # 11 approve_button (visibility)
+                gr.update(visible=is_second_phase_active),  # 12 reject_button (visibility)
+                initial_load_tuple[0],     # 13 current_page_idx_state (value)
+                initial_load_tuple[1],     # 14 current_idx_on_page_state (value)
+                initial_load_tuple[2],     # 15 audio_player (value or gr.update obj)
+                initial_load_tuple[3],     # 16 transcript_tb (can be same as 6, Gradio handles it)
+                initial_load_tuple[5],     # 17 status_md (value)
+                initial_load_tuple[6]      # 18 original_transcript_state (value)
             )
+        else:
             CURRENT_USERNAME = None
+            token = None # Clear global token if auth fails or user not allowed
+            return _failed_login_outputs(f"User '{username}' not in allowed user list.", reviewer_text_val="Unauthorized")
     except Exception as e:
         # Top-level boundary for the login flow: reset identity, log the
         # traceback and surface the error text in the login form.
         CURRENT_USERNAME = None
+        token = None # Clear global token on any login exception
         import traceback
         login_err_msg = f"Login failed: {str(e)}"
         print(f"{login_err_msg}\n{traceback.format_exc()}")
         return _failed_login_outputs(login_err_msg, reviewer_text_val="Login Error")
+# Gradio interface: stylesheet and Blocks layout
 # Stylesheet handed to gr.Blocks(css=...): four background-colour classes plus
 # centred bold text for inputs inside `.reviewer-textbox`.
 # NOTE(review): the colour classes are presumably the values applied through
 # `elem_classes` on the reviewer textbox elsewhere in this file -- confirm.
 css = """
 .white { background-color: white; color: black; } .yellow { background-color: yellow; color: black; }
 .blue { background-color: lightblue; color: black; } .green { background-color: lightgreen; color: black; }
 .reviewer-textbox input { text-align: center; font-weight: bold; }
 """
 with gr.Blocks(css=css, title="ASR Dataset Labeling Tool") as demo:
+    # hf_token_state will store the token provided via UI and used for operations.
+    # Initialize with env var 'token' if available, otherwise empty.
+    # This gr.State is updated by the hf_login function's output.
+    hf_token_state = gr.State(os.getenv("hf_token") or "")
     current_page_idx_state = gr.State(0)
     current_idx_on_page_state = gr.State(0)
     original_transcript_state = gr.State("")
     with gr.Column(visible=True, elem_id="login_container") as login_container:
         gr.Markdown("## HF Authentication")
+        # hf_token_input default value is also from env var, or empty.
+        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password", value=os.getenv("hf_token") or "")
         login_button = gr.Button("Login")
         login_message = gr.Markdown("")
                 jump_text_tb = gr.Textbox(label="Jump to Global Index", placeholder="Enter dataset absolute index")
                 jump_button = gr.Button("Jump")
             with gr.Row():
+                # Default repo name will be updated more accurately if user logs in.
+                # For now, a generic placeholder.
+                hf_repo_name_tb = gr.Textbox(label="Export Repository Name (your_hf_username/dataset-name)", value="your-hf-username/my-annotated-asr-dataset")
                 hf_export_button = gr.Button("Export to Hugging Face", variant="primary")
             hf_export_status_md = gr.Markdown("")
     # Outputs for login_button (19 outputs)
     login_outputs = [
+        login_container, main_container, reviewer_tb, hf_token_state, login_message, # 0-4
+        save_next_button, transcript_tb, trim_button, undo_trim_button, delete_button, # 5-9
+        first_phase_accept_cb, # 10 (this receives a gr.update obj with 'visible' and 'value' keys)
+        approve_button, reject_button, # 11-12
+        current_page_idx_state, current_idx_on_page_state, audio_player, # 13-15
+        transcript_tb, # 16 (target for transcript value, can be same as #6)
+        status_md, original_transcript_state # 17-18
     ]
     login_button.click(fn=hf_login, inputs=[hf_token_input], outputs=login_outputs)
+    # Common outputs for navigation and actions that reload sample view (9 outputs from load_interface_data)
+    # (page_idx_state, idx_on_page_state, audio_player, transcript_tb_update, reviewer_tb_update,
+    #  status_md, original_transcript_state, first_phase_accept_cb_vis_update, first_phase_accept_cb_val)
     navigation_outputs_extended = [
+        current_page_idx_state, current_idx_on_page_state, # States
+        audio_player, transcript_tb, reviewer_tb, status_md, original_transcript_state, # UI components
+        first_phase_accept_cb, # For visibility update (receives gr.update(visible=...))
+        first_phase_accept_cb  # For value update (receives value directly, Gradio checkbox handles it)
     ]
     save_next_button.click(
         inputs=[current_page_idx_state, current_idx_on_page_state],
         outputs=navigation_outputs_extended
     )
     approve_button.click(
         fn=review_and_next_sample_second_phase,
         inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("approved")],
         inputs=[current_page_idx_state, current_idx_on_page_state, gr.State("rejected")],
         outputs=navigation_outputs_extended
     )
     trim_button.click(
         fn=trim_audio_action,
         inputs=[current_page_idx_state, current_idx_on_page_state, trim_start_tb, trim_end_tb],
     delete_button.click(
         fn=confirm_delete_audio_action,
         inputs=[current_page_idx_state, current_idx_on_page_state],
+        outputs=navigation_outputs_extended
     )
     jump_button.click(
         fn=jump_to_absolute_idx,
         inputs=[jump_text_tb, current_page_idx_state, current_idx_on_page_state],
     )
     hf_export_button.click(
         fn=export_to_huggingface,
+        inputs=[hf_repo_name_tb, hf_token_state], # Use hf_token_state here
         outputs=[hf_export_status_md],
         queue=True
     )
 if __name__ == "__main__":
+    # Initializations that don't depend on login token can be here
+    # For example, setting SECOND_PHASE based on an env var or config file.
+    # However, total_samples and annotator_ranges should primarily be determined *after* login,
+    # as they might depend on the dataset accessible by the user's token.
+    # Example: Override SECOND_PHASE for testing
+    # os.environ['APP_SECOND_PHASE'] = "True"
+    # SECOND_PHASE = os.getenv('APP_SECOND_PHASE', 'False').lower() == 'true'
+    print(f"Application starting. Second phase mode: {SECOND_PHASE}")
+    # Initial dataset info try (might fail if token needed and not globally set from env)
+    # This is mostly for informational purposes before login, hf_login will do a more robust fetch.
+    if total_samples <= 0:
+        print("Main block: total_samples not yet set. Will be determined after login.")
     if SECOND_PHASE:
         print("==== APPLICATION LAUNCHING IN SECOND PHASE (REVIEW MODE) ====")
+        # Initialization of SECOND_PHASE_REVIEW_MAPPING will happen after login,
+        # once total_samples and annotator_ranges are confirmed.
     else:
         print("==== APPLICATION LAUNCHING IN FIRST PHASE (ANNOTATION MODE) ====")