File size: 14,380 Bytes
6716568 ab6db1e 6716568 cd16225 6716568 ab6db1e 6716568 ab6db1e 6716568 3faf1d8 fb1ef83 725e2f6 03986eb c1ef5cd fb1ef83 cd16225 c1ef5cd 03986eb cd16225 03986eb cd16225 03986eb cd16225 03986eb cd16225 03986eb cd16225 03986eb cd16225 28eae3c ab6db1e 28eae3c ab6db1e 28eae3c ab6db1e 28eae3c ab6db1e 28eae3c ab6db1e 28eae3c ab6db1e 28eae3c ab6db1e cd16225 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 |
# main.py
import os
import io # For handling bytes data in memory
import yt_dlp # YouTube audio downloader
import requests # For making HTTP requests (to audio URLs)
import openai # for making requests to openrouter
from fastapi import FastAPI, HTTPException, Request # The web framework
from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
from pydantic import BaseModel # For data validation
from pydub import AudioSegment # For splitting audio into chunks
from huggingface_hub import InferenceClient # HF API client
from dotenv import load_dotenv # To load .env file locally
# --- Initial Setup ---
# Load environment variables from .env file (for local development)
# In HF Spaces, secrets are set in the Space settings, not via .env
load_dotenv()
# Credentials read from the environment; either may be None if unset.
HF_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
OPENROUTER_API_KEY= os.getenv("OPENROUTER_API_KEY")
# Point the (pre-1.0) openai client at OpenRouter's OpenAI-compatible endpoint;
# generate_notes_from_transcript relies on this module-level configuration.
openai.api_key = OPENROUTER_API_KEY
openai.api_base = "https://openrouter.ai/api/v1"
# Check if the API key is loaded (crucial!)
if not HF_API_KEY:
    print("ERROR: HUGGINGFACE_API_KEY environment variable not found.")
    # I might want to exit or raise an error here in a real deployment
    # For now, we'll let it proceed but API calls will fail later.
# Define the models we'll use from Hugging Face
# I can change these! Smaller Whisper models (base, small, medium) are faster.
# Different LLMs have different strengths.
ASR_MODEL = "openai/whisper-large-v3"  # used by transcribe_audio via the HF client
LLM_MODEL = "microsoft/mai-ds-r1:free"  # NOTE(review): looks like an OpenRouter model id, not a HF one — confirm
# Initialize the Hugging Face Inference Client
# Handles authentication using the API key automatically
try:
    hf_inference = InferenceClient(token=HF_API_KEY)
    print("Hugging Face Inference Client initialized.")
except Exception as e:
    print(f"ERROR: Failed to initialize Hugging Face Inference Client: {e}")
    hf_inference = None  # Ensure it's None if initialization fails; transcribe_audio checks this
# Initialize the FastAPI application
app = FastAPI(
    title="Video Note Taker API",
    description="Transcribes videos and generates notes using Hugging Face models.",
    version="0.1.0",
)
# --- CORS Configuration ---
# Configure Cross-Origin Resource Sharing (CORS)
# This is VITAL to allow Vercel frontend (running on a different domain)
# to make requests to this backend API.
# Browsers will block cross-origin requests from any origin not listed here.
origins = [
    "http://localhost:3000",  # Allow my local frontend dev server
    # !!! IMPORTANT: Add my DEPLOYED Vercel frontend URL here later !!!
    # Example: "https://videos-notes-app.vercel.app",
]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # List of allowed origins
    allow_credentials=True,  # Allow cookies (not strictly needed now, but good practice)
    allow_methods=["*"],  # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],  # Allow all headers
)
# --- Data Models (Request Validation) ---
# Define the expected structure of the request body using Pydantic
class ProcessRequest(BaseModel):
    """Request body for POST /process-video/: the YouTube video to process."""
    youtubeUrl: str  # Expecting a field named "youtubeUrl" which is a string
def _select_audio_format(info):
    """Pick the best audio stream dict from yt-dlp's extracted info, or None.

    Prefers audio-only streams in well-known containers; falls back to the
    first format that carries any audio at all.
    """
    formats = info.get('formats', [])
    for f in formats:
        # Audio-only stream (no video track) in a known-good container/codec.
        if f.get('acodec') != 'none' and f.get('vcodec') == 'none':
            if f.get('ext') in ['mp3', 'opus', 'm4a', 'webm']:
                return f
    # Fallback: any format with an audio track, even if muxed with video.
    for f in formats:
        if f.get('acodec') != 'none':
            return f
    return None


def download_audio_bytes(youtube_url: str) -> tuple[bytes, str]:
    """
    Download the audio track of a YouTube video into memory.

    yt-dlp is used only to resolve stream metadata (``download=False``); the
    chosen audio stream URL is then fetched with ``requests`` into an
    in-memory buffer, so no ffmpeg post-processing is required.

    Args:
        youtube_url: Full URL of the YouTube video.

    Returns:
        Tuple ``(audio_bytes, audio_format)`` where ``audio_format`` is the
        container extension reported by yt-dlp (e.g. 'webm', 'm4a'),
        defaulting to 'mp3' if unknown.

    Raises:
        HTTPException: 500 on any extraction or stream-download failure.
    """
    print(f"Attempting to download audio for: {youtube_url}")
    # Note: no 'postprocessors'/'outtmpl' opts — they only matter for
    # yt-dlp-driven downloads, and we never download through yt-dlp itself.
    ydl_opts = {
        'format': 'bestaudio/best',  # Prioritize best audio-only, fallback to best overall
        'noplaylist': True,          # Don't download playlist if URL is part of one
        'quiet': True,               # Suppress yt-dlp console output
        'no_warnings': True,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Resolve formats without downloading; we fetch the stream ourselves.
            info = ydl.extract_info(youtube_url, download=False)
        best_audio_format = _select_audio_format(info)
        if not best_audio_format or 'url' not in best_audio_format:
            print("Could not find suitable audio stream URL via yt-dlp info. Direct download might fail or require ffmpeg.")
            # Without a direct stream URL we cannot proceed reliably; a
            # yt-dlp-driven download here would additionally require ffmpeg.
            raise yt_dlp.utils.DownloadError(
                "Could not extract a direct audio URL and ffmpeg may not be available."
            )
        format_note = best_audio_format.get('format_note', best_audio_format.get('ext', 'N/A'))
        print(f"Found audio format: {format_note}. Downloading directly from URL...")
        # Stream the audio URL content into an in-memory buffer.
        buffer = io.BytesIO()
        with requests.get(best_audio_format['url'], stream=True) as r:
            r.raise_for_status()  # Raise for bad status codes (4xx or 5xx)
            for chunk in r.iter_content(chunk_size=8192):
                buffer.write(chunk)
        audio_bytes = buffer.getvalue()
        print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
        if not audio_bytes:
            raise ValueError("Downloaded audio data is empty.")
        audio_format = best_audio_format.get('ext', 'mp3')  # e.g. "webm", "m4a"
        return audio_bytes, audio_format
    except yt_dlp.utils.DownloadError as e:
        print(f"ERROR: yt-dlp failed to download or process audio: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to download audio from YouTube: {e}")
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Failed to download audio stream from URL: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to fetch audio stream: {e}")
    except Exception as e:
        print(f"ERROR: Unexpected error during audio download: {e}")
        # Log the full traceback here in a real app: import traceback; traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
def chunk_audio(audio_bytes: bytes, audio_format: str = "mp3", chunk_length_ms: int = 30000) -> list[bytes]:
    """
    Split raw audio bytes into fixed-length segments.

    Args:
        audio_bytes (bytes): Raw audio data.
        audio_format (str): Format of the input audio (e.g. 'mp3', 'webm', 'm4a').
        chunk_length_ms (int): Duration of each segment in milliseconds
            (the final segment may be shorter).

    Returns:
        list[bytes]: Each segment re-encoded as MP3 bytes.

    Raises:
        ValueError: If the input bytes cannot be decoded as ``audio_format``.
    """
    try:
        segment = AudioSegment.from_file(io.BytesIO(audio_bytes), format=audio_format)
    except Exception as e:
        raise ValueError(f"Could not decode audio with format '{audio_format}': {e}")

    def _as_mp3(piece) -> bytes:
        # Normalize every chunk to MP3 regardless of the input container.
        out = io.BytesIO()
        piece.export(out, format="mp3")
        return out.getvalue()

    return [
        _as_mp3(segment[start:start + chunk_length_ms])
        for start in range(0, len(segment), chunk_length_ms)
    ]
def transcribe_audio(audio_bytes: bytes) -> str:
    """
    Transcribe raw audio bytes via the Hugging Face ASR inference API.

    Args:
        audio_bytes: Raw audio data to transcribe.

    Returns:
        The transcript text (may be empty if the model produced nothing).

    Raises:
        HTTPException: 503 if the HF client is unavailable or the call fails.
        ValueError: If ``audio_bytes`` is empty.
    """
    # Guard clauses: a usable client and non-empty input are prerequisites.
    if not hf_inference:
        raise HTTPException(status_code=503, detail="Transcription service client not initialized.")
    if not audio_bytes:
        raise ValueError("Cannot transcribe empty audio data.")
    print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")
    try:
        # The InferenceClient ASR task takes the raw audio bytes directly.
        response = hf_inference.automatic_speech_recognition(
            audio=audio_bytes,
            model=ASR_MODEL,
        )
        text = response.get('text', '').strip()  # default to '' and trim whitespace
        if not text:
            # Empty result is tolerated — caller decides what to do with ''.
            print("Warning: Transcription result was empty.")
        print("Transcription successful.")
        return text
    except Exception as e:
        print(f"ERROR: Hugging Face ASR API call failed: {e}")
        # 503 Service Unavailable — upstream transcription service problem.
        raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}")
def generate_notes_from_transcript(transcript: str) -> str:
    """
    Turn a transcript into structured notes via an OpenRouter chat model.

    Args:
        transcript: Full transcript text of the video.

    Returns:
        The generated notes, or a placeholder string for an empty transcript.

    Raises:
        HTTPException: 503 if the API key is missing or the LLM call fails.
    """
    if not OPENROUTER_API_KEY:
        raise HTTPException(status_code=503, detail="OpenRouter API key not found.")
    if not transcript:
        return "Transcript was empty."
    print(f"Generating notes for transcript (length {len(transcript)}) using {LLM_MODEL}...")
    # --- Prompt Engineering: Crucial for good results! ---
    # The prompt fixes the assistant's role and the expected output format.
    prompt = f"""You are an expert note-taking assistant specializing in extracting key information from video transcripts.
Please analyze the following transcript and generate concise, well-structured notes.
Focus on the main topics, key points, important examples, definitions, and any conclusions presented. Use bullet points or numbered lists for clarity.
Transcript:
\"\"\"
{transcript}
\"\"\"
Structured Notes:"""
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        completion = openai.ChatCompletion.create(
            model=LLM_MODEL,
            messages=chat_messages,
            max_tokens=1024,
            temperature=0.7,
        )
        notes = completion.choices[0].message.content
        print("Note generation successful.")
        return notes
    except Exception as e:
        print(f"OpenRouter call failed: {e}")
        raise HTTPException(status_code=503, detail=f"OpenRouter failed: {e}")
# --- API Endpoints ---
@app.get("/")
async def read_root():
    """Health check endpoint. Visit BASE_URL/ to see this."""
    payload = {"message": "YouTube Notes API is running!"}
    return payload
@app.get("/docs", include_in_schema=False)
async def custom_swagger_ui_html():
    """Serve the interactive Swagger UI documentation at BASE_URL/docs."""
    # Imported lazily so the dependency is only touched when /docs is hit.
    from fastapi.openapi.docs import get_swagger_ui_html
    docs_title = app.title + " - Docs"
    return get_swagger_ui_html(openapi_url=app.openapi_url, title=docs_title)
@app.get(app.openapi_url, include_in_schema=False)
async def get_open_api_endpoint():
    """Serve the OpenAPI schema document."""
    # Imported lazily; schema is rebuilt from the live route table per request.
    from fastapi.openapi.utils import get_openapi
    schema = get_openapi(title=app.title, version=app.version, routes=app.routes)
    return schema
@app.post("/process-video/")
async def process_video(request: ProcessRequest):
    """
    Main endpoint: receive a YouTube URL, transcribe its audio, return notes.

    Accessible via POST request to BASE_URL/process-video/.

    Returns:
        ``{"notes": <generated notes string>}``

    Raises:
        HTTPException: propagated from helpers, or 500 on unexpected errors.
    """
    print(f"Received request for URL: {request.youtubeUrl}")
    try:
        # Step 1: pull the audio track into memory.
        audio_bytes, audio_format = download_audio_bytes(request.youtubeUrl)
        # Step 2: split into segments and transcribe each one; a failing
        # segment is skipped rather than aborting the whole request.
        segments = chunk_audio(audio_bytes, audio_format)
        total = len(segments)
        transcript_parts = []
        for num, segment in enumerate(segments, start=1):
            print(f"Transcribing chunk {num}/{total}...")
            try:
                transcript_parts.append(transcribe_audio(segment))
            except Exception as e:
                print(f"Skipping chunk {num} due to error: {e}")
        # Step 3: condense the stitched transcript into structured notes.
        notes = generate_notes_from_transcript(" ".join(transcript_parts).strip())
        # Step 4: hand the notes back to the caller.
        print("Processing complete. Returning notes.")
        return {"notes": notes}
    except HTTPException as http_exc:
        # Deliberate HTTP errors from the helpers pass through unchanged.
        print(f"HTTP Exception occurred: {http_exc.detail}")
        raise http_exc
    except Exception as e:
        # Anything unexpected becomes a generic 500.
        print(f"ERROR: Unhandled exception in /process-video/ endpoint: {e}")
        # Log traceback here in production: traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}")