# main.py
"""Video Note Taker API.

Pipeline exposed by ``POST /process-video/``:

1. Resolve a direct audio stream URL for a YouTube video with yt-dlp and
   download it into memory.
2. Split the audio into fixed-length chunks with pydub.
3. Transcribe each chunk with a Hugging Face ASR model.
4. Ask an OpenRouter-hosted chat model to turn the transcript into notes.
"""
import os
import io  # For handling bytes data in memory
from typing import Any, Optional

import yt_dlp  # YouTube audio downloader
import requests  # For making HTTP requests (to audio URLs)
import openai  # For making requests to OpenRouter
from fastapi import FastAPI, HTTPException, Request  # The web framework
from fastapi.concurrency import run_in_threadpool  # Run blocking work off the event loop
from fastapi.middleware.cors import CORSMiddleware  # For allowing frontend access
from pydantic import BaseModel  # For data validation
from pydub import AudioSegment  # For splitting audio into chunks
from huggingface_hub import InferenceClient  # HF API client
from dotenv import load_dotenv  # To load .env file locally

# --- Initial Setup ---
# Load environment variables from a .env file (for local development).
# In HF Spaces, secrets are set in the Space settings, not via .env.
load_dotenv()

HF_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

# Module-level configuration used by the legacy (pre-1.0) openai interface.
openai.api_key = OPENROUTER_API_KEY
openai.api_base = OPENROUTER_BASE_URL

# Check that the API key is loaded. We only warn here; the ASR calls will
# fail later if the key is really missing.
if not HF_API_KEY:
    print("ERROR: HUGGINGFACE_API_KEY environment variable not found.")

# Models used by the pipeline. Smaller Whisper models (base, small, medium)
# are faster; different LLMs have different strengths.
ASR_MODEL = "openai/whisper-large-v3"
LLM_MODEL = "microsoft/mai-ds-r1:free"

# (connect, read) timeouts in seconds for the direct audio download, so a
# stalled stream cannot hang a request forever.
AUDIO_DOWNLOAD_TIMEOUT = (10, 60)

# Audio-only container extensions we prefer when picking a stream.
_PREFERRED_AUDIO_EXTS = ("mp3", "opus", "m4a", "webm")

# Initialize the Hugging Face Inference Client.
try:
    hf_inference = InferenceClient(token=HF_API_KEY)
    print("Hugging Face Inference Client initialized.")
except Exception as e:
    print(f"ERROR: Failed to initialize Hugging Face Inference Client: {e}")
    hf_inference = None  # Ensure it's None if initialization fails

# Initialize the FastAPI application.
app = FastAPI(
    title="Video Note Taker API",
    description="Transcribes videos and generates notes using Hugging Face models.",
    version="0.1.0",
)

# --- CORS Configuration ---
# Cross-Origin Resource Sharing must be configured so that the frontend
# (served from a different origin) can call this backend.
origins = [
    "http://localhost:3000",  # Local frontend dev server
    # IMPORTANT: add the deployed Vercel frontend URL here,
    # e.g. "https://videos-notes-app.vercel.app".
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # List of allowed origins
    allow_credentials=True,  # Allow cookies (not strictly needed now)
    allow_methods=["*"],  # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],  # Allow all headers
)


# --- Data Models (Request Validation) ---
class ProcessRequest(BaseModel):
    """Request body for ``POST /process-video/``."""

    youtubeUrl: str  # URL of the video to process


def _select_audio_format(formats: list) -> Optional[dict]:
    """Return the first downloadable yt-dlp format entry that carries audio.

    Audio-only entries in a preferred container are tried first; otherwise
    the first entry with any audio track is used. Entries without a direct
    ``url`` are skipped because they cannot be fetched over plain HTTP.
    Returns ``None`` when nothing qualifies.
    """
    downloadable = [fmt for fmt in formats if fmt.get("url")]

    for fmt in downloadable:
        is_audio_only = fmt.get("acodec") != "none" and fmt.get("vcodec") == "none"
        if is_audio_only and fmt.get("ext") in _PREFERRED_AUDIO_EXTS:
            return fmt

    # Fallback: anything that has an audio track at all.
    for fmt in downloadable:
        if fmt.get("acodec") != "none":
            return fmt

    return None


def download_audio_bytes(youtube_url: str) -> tuple[bytes, str]:
    """Download the audio track of a YouTube video into memory.

    yt-dlp is used only to extract metadata (no download, no ffmpeg
    post-processing); the chosen stream URL is then fetched with
    ``requests``.

    Args:
        youtube_url: URL of the video.

    Returns:
        A ``(audio_bytes, audio_format)`` tuple, where ``audio_format`` is
        the container extension reported by yt-dlp (e.g. ``"webm"``,
        ``"m4a"``), defaulting to ``"mp3"`` when none is reported.

    Raises:
        HTTPException: status 500 if metadata extraction, the HTTP download,
            or any other step fails.
    """
    print(f"Attempting to download audio for: {youtube_url}")

    ydl_opts = {
        "format": "bestaudio/best",  # Prefer audio-only, fall back to best overall
        "noplaylist": True,  # Don't expand a playlist if the URL is part of one
        "quiet": True,  # Suppress yt-dlp console output
        "no_warnings": True,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)

        formats = (info or {}).get("formats") or []
        best_audio_format = _select_audio_format(formats)
        if best_audio_format is None:
            print("Could not find suitable audio stream URL via yt-dlp info. Direct download might fail or require ffmpeg.")
            raise yt_dlp.utils.DownloadError(
                "Could not extract a direct audio URL and ffmpeg may not be available."
            )

        audio_url = best_audio_format["url"]
        format_note = best_audio_format.get(
            "format_note", best_audio_format.get("ext", "N/A")
        )
        print(f"Found audio format: {format_note}. Downloading directly from URL...")

        # Stream the response into an in-memory buffer. The headers yt-dlp
        # associates with this format (if any) are forwarded with the request.
        buffer = io.BytesIO()
        with requests.get(
            audio_url,
            stream=True,
            headers=best_audio_format.get("http_headers"),
            timeout=AUDIO_DOWNLOAD_TIMEOUT,
        ) as response:
            response.raise_for_status()  # Raise for 4xx / 5xx responses
            for chunk in response.iter_content(chunk_size=8192):
                buffer.write(chunk)

        audio_bytes = buffer.getvalue()
        if not audio_bytes:
            raise ValueError("Downloaded audio data is empty.")
        print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")

        audio_format = best_audio_format.get("ext") or "mp3"
        return audio_bytes, audio_format

    except yt_dlp.utils.DownloadError as e:
        print(f"ERROR: yt-dlp failed to download or process audio: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to download audio from YouTube: {e}"
        ) from e
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Failed to download audio stream from URL: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to fetch audio stream: {e}"
        ) from e
    except Exception as e:
        print(f"ERROR: Unexpected error during audio download: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during audio processing: {e}",
        ) from e


def chunk_audio(audio_bytes: bytes, audio_format: str = "mp3", chunk_length_ms: int = 30000) -> list[bytes]:
    """Split raw audio bytes into fixed-length MP3 chunks.

    Args:
        audio_bytes: Raw audio data.
        audio_format: Container/format of ``audio_bytes`` (e.g. ``"mp3"``,
            ``"webm"``, ``"m4a"``), passed to pydub for decoding.
        chunk_length_ms: Duration of each chunk in milliseconds; must be
            positive. The last chunk may be shorter.

    Returns:
        The chunks in order, each re-encoded as MP3 regardless of the input
        format.

    Raises:
        ValueError: if ``chunk_length_ms`` is not positive or the audio
            cannot be decoded.
    """
    if chunk_length_ms <= 0:
        raise ValueError("chunk_length_ms must be a positive number of milliseconds.")

    try:
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format=audio_format)
    except Exception as e:
        raise ValueError(f"Could not decode audio with format '{audio_format}': {e}") from e

    chunks = []
    for start_ms in range(0, len(audio), chunk_length_ms):
        chunk_buffer = io.BytesIO()
        audio[start_ms:start_ms + chunk_length_ms].export(chunk_buffer, format="mp3")
        chunks.append(chunk_buffer.getvalue())
    return chunks


def _extract_transcript_text(result: Any) -> str:
    """Pull the transcript text out of an ASR result.

    Accepts a plain string, an object exposing a ``text`` attribute, or a
    mapping with a ``"text"`` key, so the caller does not depend on one
    specific return shape of the inference client.
    """
    if isinstance(result, str):
        return result.strip()
    text = getattr(result, "text", None)
    if text is None and isinstance(result, dict):
        text = result.get("text")
    return (text or "").strip()


def transcribe_audio(audio_bytes: bytes) -> str:
    """Transcribe audio with the Hugging Face ASR model ``ASR_MODEL``.

    Args:
        audio_bytes: Encoded audio data to send to the inference API.

    Returns:
        The transcript with surrounding whitespace removed; may be empty.

    Raises:
        HTTPException: status 503 if the client is not initialized or the
            API call fails.
        ValueError: if ``audio_bytes`` is empty.
    """
    if not hf_inference:
        raise HTTPException(status_code=503, detail="Transcription service client not initialized.")
    if not audio_bytes:
        raise ValueError("Cannot transcribe empty audio data.")

    print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")

    try:
        transcript_result = hf_inference.automatic_speech_recognition(
            audio=audio_bytes,
            model=ASR_MODEL,
        )
        transcript = _extract_transcript_text(transcript_result)
        if not transcript:
            # An empty transcript is returned as-is rather than treated as an error.
            print("Warning: Transcription result was empty.")
        print("Transcription successful.")
        return transcript
    except Exception as e:
        print(f"ERROR: Hugging Face ASR API call failed: {e}")
        raise HTTPException(
            status_code=503, detail=f"Transcription service failed: {e}"
        ) from e  # 503 Service Unavailable


def _create_chat_completion(prompt: str) -> Any:
    """Send ``prompt`` as a single user message to ``LLM_MODEL`` on OpenRouter.

    Uses the client-object interface when the installed ``openai`` package
    provides ``openai.OpenAI`` and falls back to the legacy module-level
    ``openai.ChatCompletion`` interface otherwise.
    """
    request_kwargs = {
        "model": LLM_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1024,
        "temperature": 0.7,
    }
    if hasattr(openai, "OpenAI"):
        client = openai.OpenAI(api_key=OPENROUTER_API_KEY, base_url=OPENROUTER_BASE_URL)
        return client.chat.completions.create(**request_kwargs)
    return openai.ChatCompletion.create(**request_kwargs)


def generate_notes_from_transcript(transcript: str) -> str:
    """Generate structured notes from a transcript via an OpenRouter chat model.

    Args:
        transcript: Full transcript text.

    Returns:
        The generated notes, or the literal ``"Transcript was empty."`` when
        ``transcript`` is empty.

    Raises:
        HTTPException: status 503 if the OpenRouter API key is missing or
            the API call fails.
    """
    if not OPENROUTER_API_KEY:
        raise HTTPException(status_code=503, detail="OpenRouter API key not found.")
    if not transcript:
        return "Transcript was empty."

    print(f"Generating notes for transcript (length {len(transcript)}) using {LLM_MODEL}...")

    # Be explicit about the assistant's role and the desired output format.
    prompt = f"""You are an expert note-taking assistant specializing in extracting key information from video transcripts.
Please analyze the following transcript and generate concise, well-structured notes.
Focus on the main topics, key points, important examples, definitions, and any conclusions presented.
Use bullet points or numbered lists for clarity.

Transcript:
\"\"\"
{transcript}
\"\"\"

Structured Notes:"""

    try:
        response = _create_chat_completion(prompt)
        # Guard against a missing message body so the return type stays str.
        notes = response.choices[0].message.content or ""
        print("Note generation successful.")
        return notes
    except Exception as e:
        print(f"OpenRouter call failed: {e}")
        raise HTTPException(status_code=503, detail=f"OpenRouter failed: {e}") from e


# --- API Endpoints ---
@app.get("/")
async def read_root():
    """Health check endpoint. Visit BASE_URL/ to see this."""
    return {"message": "YouTube Notes API is running!"}


# NOTE(review): FastAPI serves its own /docs page and OpenAPI schema by
# default, so the two routes below look redundant — confirm before removing.
@app.get("/docs", include_in_schema=False)
async def custom_swagger_ui_html():
    """Serve the Swagger UI page for this API at BASE_URL/docs."""
    from fastapi.openapi.docs import get_swagger_ui_html

    return get_swagger_ui_html(openapi_url=app.openapi_url, title=app.title + " - Docs")


@app.get(app.openapi_url, include_in_schema=False)
async def get_open_api_endpoint():
    """Serve the OpenAPI schema."""
    from fastapi.openapi.utils import get_openapi

    return get_openapi(title=app.title, version=app.version, routes=app.routes)


def _transcribe_chunks(audio_chunks: list) -> str:
    """Transcribe each chunk and join the non-empty results with spaces.

    A failing chunk is logged and skipped so one bad chunk does not discard
    the whole transcript. If every chunk fails, an HTTPException (503) is
    raised instead of silently returning an empty transcript.
    """
    parts = []
    failures = 0
    last_error = None
    total = len(audio_chunks)

    for idx, chunk in enumerate(audio_chunks, start=1):
        print(f"Transcribing chunk {idx}/{total}...")
        try:
            partial = transcribe_audio(chunk)
        except Exception as e:  # best-effort: skip the chunk, keep going
            failures += 1
            last_error = e
            print(f"Skipping chunk {idx} due to error: {e}")
            continue
        if partial:
            parts.append(partial)

    if total and failures == total:
        detail = getattr(last_error, "detail", last_error)
        raise HTTPException(
            status_code=503,
            detail=f"Transcription failed for all {total} audio chunks: {detail}",
        )
    return " ".join(parts)


def _process_video_sync(youtube_url: str) -> str:
    """Run the blocking download -> chunk -> transcribe -> notes pipeline."""
    # Step 1: Download audio
    audio_bytes, audio_format = download_audio_bytes(youtube_url)

    # Step 2: Chunk and transcribe audio
    audio_chunks = chunk_audio(audio_bytes, audio_format)
    full_transcript = _transcribe_chunks(audio_chunks)

    # Step 3: Generate notes
    return generate_notes_from_transcript(full_transcript)


@app.post("/process-video/")
async def process_video(request: ProcessRequest):
    """Process a YouTube URL and return generated notes.

    Accessible via POST request to BASE_URL/process-video/. The pipeline is
    blocking (network calls and audio decoding), so it is run in a worker
    thread to keep the event loop responsive.

    Returns:
        ``{"notes": <str>}``.

    Raises:
        HTTPException: propagated from the pipeline helpers, or status 500
            for any other unexpected error.
    """
    print(f"Received request for URL: {request.youtubeUrl}")

    try:
        notes = await run_in_threadpool(_process_video_sync, request.youtubeUrl)
    except HTTPException as http_exc:
        # Deliberately raised by a helper: log and propagate unchanged.
        print(f"HTTP Exception occurred: {http_exc.detail}")
        raise
    except Exception as e:
        # Any other unexpected error during processing.
        print(f"ERROR: Unhandled exception in /process-video/ endpoint: {e}")
        raise HTTPException(
            status_code=500, detail=f"An internal server error occurred: {e}"
        ) from e

    # Step 4: Return notes
    print("Processing complete. Returning notes.")
    return {"notes": notes}