Commit
·
03986eb
1
Parent(s):
163b772
add transcribing audio
Browse files
main.py
CHANGED
@@ -73,7 +73,7 @@ class ProcessRequest(BaseModel):
|
|
73 |
def download_audio_bytes(youtube_url: str) -> bytes:
|
74 |
|
75 |
# Downloads the best audio-only format from a YouTube URL using yt-dlp and returns the raw audio data as bytes.
|
76 |
-
|
77 |
print(f"Attempting to download audio for: {youtube_url}")
|
78 |
ydl_opts = {
|
79 |
'format': 'bestaudio/best', # Prioritize best audio-only, fallback to best audio in general
|
@@ -152,4 +152,33 @@ def download_audio_bytes(youtube_url: str) -> bytes:
|
|
152 |
except Exception as e:
|
153 |
print(f"ERROR: Unexpected error during audio download: {e}")
|
154 |
# Log the full traceback here in a real app: import traceback; traceback.print_exc()
|
155 |
-
raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
def download_audio_bytes(youtube_url: str) -> bytes:
|
74 |
|
75 |
# Downloads the best audio-only format from a YouTube URL using yt-dlp and returns the raw audio data as bytes.
|
76 |
+
|
77 |
print(f"Attempting to download audio for: {youtube_url}")
|
78 |
ydl_opts = {
|
79 |
'format': 'bestaudio/best', # Prioritize best audio-only, fallback to best audio in general
|
|
|
152 |
except Exception as e:
|
153 |
print(f"ERROR: Unexpected error during audio download: {e}")
|
154 |
# Log the full traceback here in a real app: import traceback; traceback.print_exc()
|
155 |
+
raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
|
156 |
+
|
157 |
+
|
158 |
+
def transcribe_audio(audio_bytes: bytes) -> str:
    """
    Sends audio bytes to the Hugging Face ASR (Automatic Speech Recognition) API.

    Args:
        audio_bytes: Raw audio data to transcribe. Must be non-empty.

    Returns:
        The transcribed text with leading/trailing whitespace removed. An empty
        string is returned (after printing a warning) when the service yields
        no text.

    Raises:
        HTTPException: 503 when the module-level `hf_inference` client is not
            initialized, or when the ASR call / response handling fails.
        ValueError: When `audio_bytes` is empty.
    """
    if not hf_inference:
        raise HTTPException(status_code=503, detail="Transcription service client not initialized.")
    if not audio_bytes:
        raise ValueError("Cannot transcribe empty audio data.")

    print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")
    try:
        # Use the InferenceClient for ASR task; it expects the raw audio bytes.
        transcript_result = hf_inference.automatic_speech_recognition(
            audio=audio_bytes,
            model=ASR_MODEL
        )
        # The result shape is not guaranteed to be a dict: accept a plain
        # string, a mapping with a 'text' key, or an object exposing `.text`,
        # so a client-library change does not surface as a bogus 503.
        if isinstance(transcript_result, str):
            raw_text = transcript_result
        elif isinstance(transcript_result, dict):
            raw_text = transcript_result.get('text', '')
        else:
            raw_text = getattr(transcript_result, 'text', '')
        # Guard against an explicit None text before stripping.
        transcript = (raw_text or '').strip()
    except Exception as e:
        print(f"ERROR: Hugging Face ASR API call failed: {e}")
        # Check for specific HF error types if possible
        # Chain the original error so the traceback keeps the root cause.
        raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}") from e  # 503 Service Unavailable

    if not transcript:
        print("Warning: Transcription result was empty.")
        # Decide: return empty string or raise error? Let's return empty for now.
    print("Transcription successful.")
    return transcript
|