Spaces:

OmarHusseinZaki
/

vid-to-notes-backend

Running

App Files Files Community

vid-to-notes-backend / main.py

OmarHusseinZaki

using openrouter ai chat instead of hf models

ab6db1e 4 days ago

raw

history blame contribute delete

14.4 kB

	# main.py

	import os
	import io # For handling bytes data in memory
	import yt_dlp # YouTube audio downloader
	import requests # For making HTTP requests (to audio URLs)
	import openai # for making requests to openrouter
	from fastapi import FastAPI, HTTPException, Request # The web framework
	from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
	from pydantic import BaseModel # For data validation
	from pydub import AudioSegment # For splitting audio into chunks
	from huggingface_hub import InferenceClient # HF API client
	from dotenv import load_dotenv # To load .env file locally

	# --- Initial Setup ---

	# Load environment variables from a local .env file (local development).
	# In HF Spaces, secrets are set in the Space settings, not via .env.
	load_dotenv()

	HF_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
	OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

	# Route the openai client to OpenRouter's API endpoint.
	openai.api_key = OPENROUTER_API_KEY
	openai.api_base = "https://openrouter.ai/api/v1"

	# Report every missing credential at startup. The app is deliberately allowed
	# to keep running; the API calls that need the missing key will fail later.
	if not HF_API_KEY:
	    print("ERROR: HUGGINGFACE_API_KEY environment variable not found.")
	if not OPENROUTER_API_KEY:
	    print("ERROR: OPENROUTER_API_KEY environment variable not found.")

	# Model identifiers used further down in this file:
	#   LLM_MODEL - passed to the chat-completion call that writes the notes.
	#   ASR_MODEL - passed to the Hugging Face speech-recognition call.
	# These can be swapped; smaller Whisper models (base, small, medium) are faster.
	LLM_MODEL = "microsoft/mai-ds-r1:free"
	ASR_MODEL = "openai/whisper-large-v3"

	# Create the Hugging Face Inference Client with the API key as its token.
	# On failure the name is still bound (to None) so later code can test for it.
	try:
	    hf_inference = InferenceClient(token=HF_API_KEY)
	    print("Hugging Face Inference Client initialized.")
	except Exception as init_error:
	    print(f"ERROR: Failed to initialize Hugging Face Inference Client: {init_error}")
	    hf_inference = None

	# Create the FastAPI application object that all routes below attach to.
	app = FastAPI(
	    version="0.1.0",
	    title="Video Note Taker API",
	    description="Transcribes videos and generates notes using Hugging Face models.",
	)

	# --- CORS Configuration ---
	# The Vercel frontend runs on a different domain than this backend, so its
	# origin has to be allow-listed here before it can call the API.
	origins = [
	    # Local frontend dev server.
	    "http://localhost:3000",
	    # !!! IMPORTANT: add the DEPLOYED Vercel frontend URL here later !!!
	    # Example: "https://videos-notes-app.vercel.app",
	]

	app.add_middleware(
	    CORSMiddleware,
	    allow_headers=["*"],      # every request header
	    allow_methods=["*"],      # every HTTP method (GET, POST, etc.)
	    allow_credentials=True,   # cookies allowed (not strictly needed yet)
	    allow_origins=origins,    # only the origins listed above
	)

	# --- Data Models (Request Validation) ---

	# Pydantic model describing the JSON body accepted by POST /process-video/.
	class ProcessRequest(BaseModel):
	    # The request must carry a string field named "youtubeUrl".
	    youtubeUrl: str

	def _select_audio_format(formats):
	    """Return the first audio-only format in a preferred container, else the first format with audio, else None."""
	    preferred_exts = ('mp3', 'opus', 'm4a', 'webm')
	    for fmt in formats:
	        is_audio_only = fmt.get('acodec') != 'none' and fmt.get('vcodec') == 'none'
	        if is_audio_only and fmt.get('ext') in preferred_exts:
	            return fmt
	    # Fallback: anything that carries an audio track at all.
	    for fmt in formats:
	        if fmt.get('acodec') != 'none':
	            return fmt
	    return None


	def download_audio_bytes(youtube_url: str) -> tuple[bytes, str]:
	    """
	    Looks up a direct audio stream URL for a YouTube video with yt-dlp and downloads it into memory.

	    Args:
	        youtube_url (str): URL of the video whose audio should be fetched.

	    Returns:
	        tuple[bytes, str]: The raw audio data and the extension of the chosen
	        format (e.g. "webm", "m4a"); "mp3" is used when yt-dlp reports no extension.

	    Raises:
	        HTTPException: status 500 when yt-dlp fails, when the stream request
	        fails, or on any other error (including an empty download).
	    """
	    print(f"Attempting to download audio for: {youtube_url}")
	    ydl_opts = {
	        'format': 'bestaudio/best', # Prioritize best audio-only, fallback to best in general
	        'noplaylist': True, # Don't process the playlist if the URL is part of one
	        'quiet': True, # Suppress yt-dlp console output
	        'no_warnings': True,
	        # NOTE(review): this post-processor looks unused because only metadata is
	        # extracted below (download=False) - confirm before removing it.
	        'postprocessors': [{
	            'key': 'FFmpegExtractAudio',
	            'preferredcodec': 'mp3',
	            'preferredquality': '128',
	        }],
	    }
	    # (connect, read) timeouts in seconds so a stalled stream cannot hang the request forever.
	    stream_timeout = (10, 60)

	    try:
	        # Ask yt-dlp for metadata only; the bytes are fetched with requests below.
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            info = ydl.extract_info(youtube_url, download=False)

	        best_audio_format = _select_audio_format((info or {}).get('formats') or [])
	        if not best_audio_format or 'url' not in best_audio_format:
	            # Without a direct URL there is nothing this function can stream into memory.
	            print("Could not find suitable audio stream URL via yt-dlp info.")
	            raise yt_dlp.utils.DownloadError("Could not extract a direct audio URL and ffmpeg may not be available.")

	        audio_url = best_audio_format['url']
	        format_note = best_audio_format.get('format_note', best_audio_format.get('ext', 'N/A'))
	        print(f"Found audio format: {format_note}. Downloading directly from URL...")

	        with requests.get(audio_url, stream=True, timeout=stream_timeout) as r:
	            r.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
	            audio_bytes = b"".join(r.iter_content(chunk_size=8192))

	        print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
	        if not audio_bytes:
	            raise ValueError("Downloaded audio data is empty.")
	        audio_format = best_audio_format.get('ext', 'mp3') # e.g., "webm", "m4a", etc.
	        return audio_bytes, audio_format

	    except yt_dlp.utils.DownloadError as e:
	        print(f"ERROR: yt-dlp failed to download or process audio: {e}")
	        raise HTTPException(status_code=500, detail=f"Failed to download audio from YouTube: {e}") from e
	    except requests.exceptions.RequestException as e:
	        print(f"ERROR: Failed to download audio stream from URL: {e}")
	        raise HTTPException(status_code=500, detail=f"Failed to fetch audio stream: {e}") from e
	    except Exception as e:
	        print(f"ERROR: Unexpected error during audio download: {e}")
	        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}") from e


	def chunk_audio(audio_bytes: bytes, audio_format: str = "mp3", chunk_length_ms: int = 30000) -> list[bytes]:
	    """
	    Splits raw audio bytes into smaller chunks, each re-encoded as MP3.

	    Args:
	        audio_bytes (bytes): Raw audio data.
	        audio_format (str): The format of the audio (e.g., 'mp3', 'webm', 'm4a').
	        chunk_length_ms (int): Duration of each chunk in milliseconds; must be positive.

	    Returns:
	        list[bytes]: The audio chunks in order, each exported as MP3 bytes.

	    Raises:
	        ValueError: If chunk_length_ms is not positive, or the audio cannot be
	        decoded with the given format.
	    """
	    # A zero step would crash range() and a negative one would silently yield
	    # no chunks, so reject both before doing any decoding work.
	    if chunk_length_ms <= 0:
	        raise ValueError(f"chunk_length_ms must be a positive integer, got {chunk_length_ms}")

	    try:
	        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format=audio_format)
	    except Exception as e:
	        raise ValueError(f"Could not decode audio with format '{audio_format}': {e}") from e

	    chunks = []
	    for start_ms in range(0, len(audio), chunk_length_ms):
	        chunk_buffer = io.BytesIO()
	        # Export to mp3 regardless of input format.
	        audio[start_ms:start_ms + chunk_length_ms].export(chunk_buffer, format="mp3")
	        chunks.append(chunk_buffer.getvalue())

	    return chunks


	def transcribe_audio(audio_bytes: bytes) -> str:
	    """
	    Sends audio bytes to the Hugging Face ASR (Automatic Speech Recognition) API.

	    Args:
	        audio_bytes (bytes): Encoded audio to transcribe; must be non-empty.

	    Returns:
	        str: The transcript with surrounding whitespace removed (may be empty).

	    Raises:
	        HTTPException: status 503 when the client was never initialized or the API call fails.
	        ValueError: If audio_bytes is empty.
	    """
	    if not hf_inference:
	        raise HTTPException(status_code=503, detail="Transcription service client not initialized.")
	    if not audio_bytes:
	        raise ValueError("Cannot transcribe empty audio data.")

	    print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")
	    try:
	        transcript_result = hf_inference.automatic_speech_recognition(
	            audio=audio_bytes,
	            model=ASR_MODEL
	        )
	        # Accept the result shapes the client may hand back: a plain string,
	        # a mapping with a 'text' key, or an object with a .text attribute.
	        if isinstance(transcript_result, str):
	            raw_text = transcript_result
	        elif isinstance(transcript_result, dict):
	            raw_text = transcript_result.get('text')
	        else:
	            raw_text = getattr(transcript_result, 'text', None)
	        transcript = (raw_text or '').strip()

	        if not transcript:
	            # An empty transcript is returned as-is rather than treated as an error.
	            print("Warning: Transcription result was empty.")
	        print("Transcription successful.")
	        return transcript
	    except Exception as e:
	        print(f"ERROR: Hugging Face ASR API call failed: {e}")
	        raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}") from e # 503 Service Unavailable


	def generate_notes_from_transcript(transcript: str) -> str:
	    """
	    Asks the OpenRouter chat model to turn a transcript into structured notes.

	    Returns the model's reply, or the fixed text "Transcript was empty." when
	    there is nothing to summarize. Raises HTTPException (503) when the
	    OpenRouter key is missing or the chat call fails.
	    """
	    if not OPENROUTER_API_KEY:
	        raise HTTPException(status_code=503, detail="OpenRouter API key not found.")
	    if not transcript:
	        return "Transcript was empty."

	    print(f"Generating notes for transcript (length {len(transcript)}) using {LLM_MODEL}...")

	    # The prompt spells out the assistant's role and the output shape wanted.
	    llm_prompt = f"""You are an expert note-taking assistant specializing in extracting key information from video transcripts.
	Please analyze the following transcript and generate concise, well-structured notes.
	Focus on the main topics, key points, important examples, definitions, and any conclusions presented. Use bullet points or numbered lists for clarity.

	Transcript:
	\"\"\"
	{transcript}
	\"\"\"

	Structured Notes:"""
	    chat_messages = [{"role": "user", "content": llm_prompt}]

	    try:
	        completion = openai.ChatCompletion.create(
	            model=LLM_MODEL,
	            messages=chat_messages,
	            max_tokens=1024,
	            temperature=0.7
	        )
	        first_choice = completion.choices[0]
	        generated_notes = first_choice.message.content
	        print("Note generation successful.")
	        return generated_notes
	    except Exception as call_error:
	        print(f"OpenRouter call failed: {call_error}")
	        raise HTTPException(status_code=503, detail=f"OpenRouter failed: {call_error}")


	# --- API Endpoints ---

	@app.get("/")
	async def read_root():
	    """ Health check endpoint. Visit BASE_URL/ to see this. """
	    # Fixed payload: a response here simply shows the service is up.
	    status_payload = {"message": "YouTube Notes API is running!"}
	    return status_payload

	@app.get("/docs", include_in_schema=False)
	async def custom_swagger_ui_html():
	    """ Serves the Swagger UI page for this API at BASE_URL/docs """
	    from fastapi.openapi.docs import get_swagger_ui_html

	    page_title = f"{app.title} - Docs"
	    return get_swagger_ui_html(openapi_url=app.openapi_url, title=page_title)

	@app.get(app.openapi_url, include_in_schema=False)
	async def get_open_api_endpoint():
	    """ Returns the OpenAPI schema built from the app's title, version and routes """
	    from fastapi.openapi.utils import get_openapi

	    schema = get_openapi(
	        routes=app.routes,
	        version=app.version,
	        title=app.title,
	    )
	    return schema


	def _transcribe_chunks(audio_chunks: list[bytes]) -> str:
	    """Transcribe chunks in order, skipping failed ones; raise 503 if every chunk fails."""
	    pieces = []
	    failed = 0
	    for position, chunk in enumerate(audio_chunks, start=1):
	        print(f"Transcribing chunk {position}/{len(audio_chunks)}...")
	        try:
	            pieces.append(transcribe_audio(chunk))
	        except Exception as e:
	            # Best effort: one bad chunk should not discard the whole video.
	            failed += 1
	            print(f"Skipping chunk {position} due to error: {e}")

	    # If nothing at all could be transcribed, that is a service failure,
	    # not an empty transcript - report it instead of returning empty notes.
	    if audio_chunks and failed == len(audio_chunks):
	        raise HTTPException(status_code=503, detail="Transcription failed for every audio chunk.")
	    return " ".join(pieces)


	@app.post("/process-video/")
	async def process_video(request: ProcessRequest):
	    """
	    The main endpoint: receives a YouTube URL, processes it, and returns notes.
	    Accessible via POST request to BASE_URL/process-video/
	    """
	    # Function-scope import, like the other local imports in this file.
	    import asyncio

	    print(f"Received request for URL: {request.youtubeUrl}")
	    try:
	        # Every step below blocks (network or audio decoding), so each one runs
	        # in a worker thread to keep the event loop free for other requests.

	        # Step 1: Download Audio
	        audio_bytes, audio_format = await asyncio.to_thread(download_audio_bytes, request.youtubeUrl)

	        # Step 2: Chunk and Transcribe Audio
	        audio_chunks = await asyncio.to_thread(chunk_audio, audio_bytes, audio_format)
	        full_transcript = await asyncio.to_thread(_transcribe_chunks, audio_chunks)

	        # Step 3: Generate Notes
	        notes = await asyncio.to_thread(generate_notes_from_transcript, full_transcript.strip())

	        # Step 4: Return Notes
	        print("Processing complete. Returning notes.")
	        return {"notes": notes}

	    except HTTPException as http_exc:
	        # Raised deliberately by the helpers; pass it through unchanged.
	        print(f"HTTP Exception occurred: {http_exc.detail}")
	        raise
	    except Exception as e:
	        # Anything else is unexpected and is reported as a 500.
	        print(f"ERROR: Unhandled exception in /process-video/ endpoint: {e}")
	        raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}") from e