# main.py
import os
import io  # For handling bytes data in memory

import yt_dlp  # YouTube audio downloader
import requests  # For making HTTP requests (to audio URLs)
from fastapi import FastAPI, HTTPException, Request  # The web framework
from fastapi.middleware.cors import CORSMiddleware  # For allowing frontend access
from pydantic import BaseModel  # For data validation
from huggingface_hub import InferenceClient  # HF API client
from dotenv import load_dotenv  # To load .env file locally

# --- Initial Setup ---
# Load environment variables from a .env file (for local development).
# In HF Spaces, secrets are set in the Space settings, not via .env.
load_dotenv()

HF_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
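# For local runs, the .env file only needs this one key (the value shown is a
# placeholder, not a real token):
#   HUGGINGFACE_API_KEY=hf_xxxxxxxxxxxxxxxxxxxxxxxx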

# Check that the API key is loaded (crucial!)
if not HF_API_KEY:
    print("ERROR: HUGGINGFACE_API_KEY environment variable not found.")
    # I might want to exit or raise an error here in a real deployment.
    # For now, let it proceed; API calls will simply fail later.

# Define the models we'll use from Hugging Face.
# I can change these! Smaller Whisper models (base, small, medium) are faster.
# Different LLMs have different strengths.
ASR_MODEL = "openai/whisper-large-v3"
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
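# Illustrative alternatives for ASR (smaller Whisper checkpoints on the Hub):
#   ASR_MODEL = "openai/whisper-small"  # faster, somewhat less accurate
#   ASR_MODEL = "openai/whisper-base"   # even faster, least accurate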

# Initialize the Hugging Face Inference Client.
# It handles authentication using the API key automatically.
try:
    hf_inference = InferenceClient(token=HF_API_KEY)
    print("Hugging Face Inference Client initialized.")
except Exception as e:
    print(f"ERROR: Failed to initialize Hugging Face Inference Client: {e}")
    hf_inference = None  # Ensure it's None if initialization fails

# Initialize the FastAPI application
app = FastAPI(
    title="Video Note Taker API",
    description="Transcribes videos and generates notes using Hugging Face models.",
    version="0.1.0",
)

# --- CORS Configuration ---
# Configure Cross-Origin Resource Sharing (CORS).
# This is VITAL: it allows the Vercel frontend (running on a different domain)
# to make requests to this backend API.
origins = [
    "http://localhost:3000",  # Allow my local frontend dev server
    # !!! IMPORTANT: Add my DEPLOYED Vercel frontend URL here later !!!
    # Example: "https://videos-notes-app.vercel.app",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,    # List of allowed origins
    allow_credentials=True,   # Allow cookies (not strictly needed now, but good practice)
    allow_methods=["*"],      # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],      # Allow all headers
)

# --- Data Models (Request Validation) ---
# Define the expected structure of the request body using Pydantic.
class ProcessRequest(BaseModel):
    youtubeUrl: str  # Expecting a field named "youtubeUrl" which is a string
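# Example request body this model validates (placeholder video ID):
#   {"youtubeUrl": "https://www.youtube.com/watch?v=VIDEO_ID"}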


# --- Audio Download ---
def download_audio_bytes(youtube_url: str) -> bytes:
    """Download the best audio-only stream for a YouTube URL using yt-dlp and
    return the raw audio data as bytes."""
    print(f"Attempting to download audio for: {youtube_url}")
    ydl_opts = {
        'format': 'bestaudio/best',  # Prefer best audio-only, fall back to best overall
        'noplaylist': True,          # Don't download a playlist if the URL is part of one
        'quiet': True,               # Suppress yt-dlp console output
        'no_warnings': True,
        # Postprocessing only applies if yt-dlp itself downloads and re-encodes,
        # which requires ffmpeg; the direct-URL approach below avoids that path.
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',    # Request MP3 format (widely compatible)
            'preferredquality': '128',  # Lower quality = smaller file = faster processing
        }],
        # Limit duration - uncomment and adjust to prevent very long processing:
        # 'download_ranges': yt_dlp.utils.download_range_func(None, [(0, 1200)]),  # Example: max 20 minutes
    }
    buffer = io.BytesIO()  # In-memory binary buffer for the downloaded audio
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Simpler & often better approach: resolve the stream metadata without
            # downloading, then fetch the direct audio URL ourselves with requests.
            # This avoids relying on ffmpeg being present in the container.
            info = ydl.extract_info(youtube_url, download=False)

            # Prefer an audio-only stream in a well-known container/codec.
            best_audio_format = None
            for f in info.get('formats', []):
                if f.get('acodec') != 'none' and f.get('vcodec') == 'none':  # Audio-only
                    if f.get('ext') in ['mp3', 'opus', 'm4a', 'webm']:
                        best_audio_format = f
                        break  # Take the first good one

            # Fallback: accept any format that carries an audio track.
            if not best_audio_format:
                for f in info.get('formats', []):
                    if f.get('acodec') != 'none':
                        best_audio_format = f
                        break  # Take the first available audio

            if not best_audio_format or 'url' not in best_audio_format:
                # Without a direct audio URL, yt-dlp would have to download and
                # re-encode the stream itself, which needs ffmpeg. Fail clearly instead.
                print("Could not find a suitable audio stream URL via yt-dlp info.")
                raise yt_dlp.utils.DownloadError(
                    "Could not extract a direct audio URL and ffmpeg may not be available."
                )

            audio_url = best_audio_format['url']
            format_note = best_audio_format.get('format_note', best_audio_format.get('ext', 'N/A'))
            print(f"Found audio format: {format_note}. Downloading directly from URL...")

        # Stream the audio URL content into the in-memory buffer.
        with requests.get(audio_url, stream=True) as r:
            r.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            for chunk in r.iter_content(chunk_size=8192):
                buffer.write(chunk)

        audio_bytes = buffer.getvalue()
        print(f"Audio downloaded successfully: {len(audio_bytes) / (1024 * 1024):.2f} MB")
        if not audio_bytes:
            raise ValueError("Downloaded audio data is empty.")
        return audio_bytes
    except yt_dlp.utils.DownloadError as e:
        print(f"ERROR: yt-dlp failed to download or process audio: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to download audio from YouTube: {e}")
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Failed to download audio stream from URL: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to fetch audio stream: {e}")
    except Exception as e:
        print(f"ERROR: Unexpected error during audio download: {e}")
        # Log the full traceback here in a real app: import traceback; traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")