# main.py

import os
import io                     # For handling bytes data in memory
import yt_dlp                 # YouTube audio downloader
import requests               # For making HTTP requests (to audio URLs)
import openai                 # For making requests to OpenRouter
from fastapi import FastAPI, HTTPException, Request # The web framework
from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
from pydantic import BaseModel # For data validation
from pydub import AudioSegment  # For splitting audio into chunks
from huggingface_hub import InferenceClient # HF API client
from dotenv import load_dotenv # To load .env file locally

# --- Initial Setup ---

# Load environment variables from .env file (for local development)
# In HF Spaces, secrets are set in the Space settings, not via .env
load_dotenv()

HF_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
# Point the openai library at OpenRouter using the legacy openai<1.0
# module-level configuration.
openai.api_key = OPENROUTER_API_KEY
openai.api_base = "https://openrouter.ai/api/v1"
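
# Note: the module-level config above (and openai.ChatCompletion.create further
# down) uses the legacy openai<1.0 interface. With openai>=1.0 the equivalent
# would be roughly the following (sketch only, not wired into this app;
# "or_client" is just an illustrative name):
#
#   from openai import OpenAI
#   or_client = OpenAI(api_key=OPENROUTER_API_KEY,
#                      base_url="https://openrouter.ai/api/v1")
#   response = or_client.chat.completions.create(
#       model=LLM_MODEL,
#       messages=[{"role": "user", "content": prompt}],
#   )
#   notes = response.choices[0].message.content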

# Check if the API key is loaded (crucial!)
if not HF_API_KEY:
    print("ERROR: HUGGINGFACE_API_KEY environment variable not found.")
    # I might want to exit or raise an error here in a real deployment
    # For now, we'll let it proceed but API calls will fail later.

# Define the models we'll use.
# ASR_MODEL is served via the Hugging Face Inference API; LLM_MODEL is an
# OpenRouter model ID. I can change these! Smaller Whisper models
# (base, small, medium) are faster, and different LLMs have different strengths.
ASR_MODEL = "openai/whisper-large-v3"
LLM_MODEL = "microsoft/mai-ds-r1:free"

# Initialize the Hugging Face Inference Client
# Handles authentication using the API key automatically
try:
    hf_inference = InferenceClient(token=HF_API_KEY)
    print("Hugging Face Inference Client initialized.")
except Exception as e:
    print(f"ERROR: Failed to initialize Hugging Face Inference Client: {e}")
    hf_inference = None # Ensure it's None if initialization fails

# Initialize the FastAPI application
app = FastAPI(
    title="Video Note Taker API",
    description="Transcribes videos and generates notes using Hugging Face models.",
    version="0.1.0",
)

# --- CORS Configuration ---
# Configure Cross-Origin Resource Sharing (CORS)
# This is VITAL to allow the Vercel frontend (running on a different domain)
# to make requests to this backend API.
origins = [
    "http://localhost:3000", # Allow my local frontend dev server
    # !!! IMPORTANT: Add my DEPLOYED Vercel frontend URL here later !!!
    # Example: "https://videos-notes-app.vercel.app",
]
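
# Optionally, an extra allowed origin could be supplied via an environment
# variable so the deployed frontend URL doesn't have to be hard-coded
# (sketch only; "FRONTEND_ORIGIN" is a hypothetical variable name):
#
#   extra_origin = os.getenv("FRONTEND_ORIGIN")
#   if extra_origin:
#       origins.append(extra_origin)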

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins, # List of allowed origins
    allow_credentials=True, # Allow cookies (not strictly needed now, but good practice)
    allow_methods=["*"], # Allow all HTTP methods (GET, POST, etc.)
    allow_headers=["*"], # Allow all headers
)

# --- Data Models (Request Validation) ---

# Define the expected structure of the request body using Pydantic
class ProcessRequest(BaseModel):
    youtubeUrl: str # Expecting a field named "youtubeUrl" which is a string
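    # Example request body (hypothetical video URL):
    #   {"youtubeUrl": "https://www.youtube.com/watch?v=<VIDEO_ID>"}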

def download_audio_bytes(youtube_url: str) -> tuple[bytes, str]:
    """
    Downloads the best audio-only format from a YouTube URL using yt-dlp and
    returns the raw audio data as bytes together with its format/extension
    (e.g. 'mp3', 'webm', 'm4a').
    """

    print(f"Attempting to download audio for: {youtube_url}")
    ydl_opts = {
        'format': 'bestaudio/best', # Prioritize best audio-only, fallback to best audio in general
        'noplaylist': True,       # Don't download playlist if URL is part of one
        'quiet': True,            # Suppress yt-dlp console output
        'no_warnings': True,
        'postprocessors': [{       # Use ffmpeg (if installed) to extract audio if needed
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3', # Request MP3 format (widely compatible)
            'preferredquality': '128', # Lower quality = smaller file = faster processing
        }],
        # Limit duration - uncomment and adjust if needed to prevent very long processing
        # 'download_ranges': yt_dlp.utils.download_range_func(None, [(0, 1200)]), # Example: Max 20 minutes (1200 seconds)
    }

    buffer = io.BytesIO() # Create an in-memory binary buffer

    try:
        # Approach: ask yt-dlp for the available formats (without downloading),
        # pick a direct audio stream URL, then fetch that URL ourselves with
        # requests into the in-memory buffer. This avoids piping yt-dlp output
        # through stdout and avoids needing ffmpeg for a full download.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             info = ydl.extract_info(youtube_url, download=False) # Get info without downloading yet
             best_audio_format = None
             for f in info.get('formats', []):
                 # Look for formats processed by FFmpegExtractAudio or good audio codecs
                 if f.get('acodec') != 'none' and f.get('vcodec') == 'none': # Audio-only
                     if f.get('ext') in ['mp3', 'opus', 'm4a', 'webm']: # Prefer known good audio containers/codecs
                         best_audio_format = f
                         break # Take the first good one

             # Fallback if no ideal format found
             if not best_audio_format:
                 for f in info.get('formats', []):
                     if f.get('acodec') != 'none':
                         best_audio_format = f
                         break # Take first available audio

             if not best_audio_format or 'url' not in best_audio_format:
                 # Without a direct stream URL the only fallback is a real yt-dlp
                 # download, which needs ffmpeg for the MP3 postprocessor and whose
                 # output isn't captured by this in-memory approach, so fail fast.
                 print("Could not find a suitable audio stream URL via yt-dlp info.")
                 raise yt_dlp.utils.DownloadError("Could not extract a direct audio URL (and ffmpeg may not be available for a full download).")


             audio_url = best_audio_format['url']
             format_note = best_audio_format.get('format_note', best_audio_format.get('ext', 'N/A'))
             print(f"Found audio format: {format_note}. Downloading directly from URL...")

             # Download the audio URL content into the buffer
             with requests.get(audio_url, stream=True) as r:
                 r.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
                 for chunk in r.iter_content(chunk_size=8192):
                     buffer.write(chunk)

             audio_bytes = buffer.getvalue()
             print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
             if not audio_bytes:
                 raise ValueError("Downloaded audio data is empty.")
             audio_format = best_audio_format.get('ext', 'mp3')  # e.g., "webm", "m4a", etc.
             return audio_bytes, audio_format

    except yt_dlp.utils.DownloadError as e:
        print(f"ERROR: yt-dlp failed to download or process audio: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to download audio from YouTube: {e}")
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Failed to download audio stream from URL: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to fetch audio stream: {e}")
    except Exception as e:
        print(f"ERROR: Unexpected error during audio download: {e}")
        # Log the full traceback here in a real app: import traceback; traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
    

def chunk_audio(audio_bytes: bytes, audio_format: str = "mp3", chunk_length_ms: int = 30000) -> list[bytes]:
    """
    Splits raw audio bytes into smaller chunks using the specified format.

    Args:
        audio_bytes (bytes): Raw audio data.
        audio_format (str): The format of the audio (e.g., 'mp3', 'webm', 'm4a').
        chunk_length_ms (int): Duration of each chunk in milliseconds.

    Returns:
        List[bytes]: List of audio chunks as bytes.
    """
    try:
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format=audio_format)
    except Exception as e:
        raise ValueError(f"Could not decode audio with format '{audio_format}': {e}")

    chunks = []
    total_length = len(audio)

    for i in range(0, total_length, chunk_length_ms):
        chunk = audio[i:i+chunk_length_ms]
        chunk_buffer = io.BytesIO()
        chunk.export(chunk_buffer, format="mp3")  # Export to mp3 regardless of input format
        chunks.append(chunk_buffer.getvalue())

    return chunks
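
# Note: pydub delegates decoding/encoding to ffmpeg, so ffmpeg must be available
# in the runtime image. A quick startup sanity check could look like this
# (sketch only):
#
#   from pydub.utils import which
#   if which("ffmpeg") is None:
#       print("WARNING: ffmpeg not found on PATH; chunk_audio() will fail.")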


def transcribe_audio(audio_bytes: bytes) -> str:
    """
    Sends audio bytes to the Hugging Face ASR (Automatic Speech Recognition) API.
    """
    if not hf_inference:
        raise HTTPException(status_code=503, detail="Transcription service client not initialized.")
    if not audio_bytes:
        raise ValueError("Cannot transcribe empty audio data.")

    print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")
    try:
        # Use the InferenceClient for ASR task
        # It expects the raw audio bytes
        transcript_result = hf_inference.automatic_speech_recognition(
            audio=audio_bytes,
            model=ASR_MODEL
        )
        # Depending on the huggingface_hub version, the result is either a plain
        # string or a dict-like output object with a 'text' field; handle both.
        if isinstance(transcript_result, str):
            transcript = transcript_result.strip()
        else:
            transcript = transcript_result.get('text', '').strip()
        if not transcript:
            print("Warning: Transcription result was empty.")
            # Decide: return empty string or raise error? Let's return empty for now.
        print("Transcription successful.")
        return transcript
    except Exception as e:
        print(f"ERROR: Hugging Face ASR API call failed: {e}")
        # Check for specific HF error types if possible
        raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}") # 503 Service Unavailable


def generate_notes_from_transcript(transcript: str) -> str:
    """
    Sends the transcript to OpenRouter LLM (chat model) and gets structured notes back.
    """
    if not OPENROUTER_API_KEY:
        raise HTTPException(status_code=503, detail="OpenRouter API key not found.")
    if not transcript:
        return "Transcript was empty."

    print(f"Generating notes for transcript (length {len(transcript)}) using {LLM_MODEL}...")

    # --- Prompt Engineering: Crucial for good results! ---
    # Be explicit about the desired output format and role.
    prompt = f"""You are an expert note-taking assistant specializing in extracting key information from video transcripts. 
    Please analyze the following transcript and generate concise, well-structured notes. 
    Focus on the main topics, key points, important examples, definitions, and any conclusions presented. Use bullet points or numbered lists for clarity.

    Transcript:
    \"\"\"
    {transcript}
    \"\"\"

    Structured Notes:"""

    try:
        response = openai.ChatCompletion.create(
            model=LLM_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1024,
            temperature=0.7
        )
        notes = response.choices[0].message.content
        print("Note generation successful.")
        return notes

    except Exception as e:
        print(f"OpenRouter call failed: {e}")
        raise HTTPException(status_code=503, detail=f"OpenRouter failed: {e}")
    

# --- API Endpoints ---

@app.get("/")
async def read_root():
    """ Health check endpoint. Visit BASE_URL/ to see this. """
    return {"message": "YouTube Notes API is running!"}

@app.get("/docs", include_in_schema=False)
async def custom_swagger_ui_html():
    """ Access automatic API documentation via BASE_URL/docs """
    from fastapi.openapi.docs import get_swagger_ui_html
    return get_swagger_ui_html(openapi_url=app.openapi_url, title=app.title + " - Docs")

@app.get(app.openapi_url, include_in_schema=False)
async def get_open_api_endpoint():
     """ Serves the OpenAPI schema """
     from fastapi.openapi.utils import get_openapi
     return get_openapi(title=app.title, version=app.version, routes=app.routes)


@app.post("/process-video/")
async def process_video(request: ProcessRequest):
    """
    The main endpoint: receives a YouTube URL, processes it, and returns notes.
    Accessible via POST request to BASE_URL/process-video/
    """
    print(f"Received request for URL: {request.youtubeUrl}")
    try:
        # Step 1: Download Audio
        audio_bytes, audio_format = download_audio_bytes(request.youtubeUrl)

        # Step 2: Chunk and Transcribe Audio
        audio_chunks = chunk_audio(audio_bytes, audio_format)
        full_transcript = ""
        for idx, chunk in enumerate(audio_chunks):
            print(f"Transcribing chunk {idx + 1}/{len(audio_chunks)}...")
            try:
                partial = transcribe_audio(chunk)
                full_transcript += partial + " "
            except Exception as e:
                print(f"Skipping chunk {idx + 1} due to error: {e}")

        # Step 3: Generate Notes
        notes = generate_notes_from_transcript(full_transcript.strip())

        # Step 4: Return Notes
        print("Processing complete. Returning notes.")
        return {"notes": notes}

    except HTTPException as http_exc:
        # If an HTTPException was raised deliberately in helpers, re-raise it
        print(f"HTTP Exception occurred: {http_exc.detail}")
        raise http_exc
    except Exception as e:
        # Catch any other unexpected errors during the process
        print(f"ERROR: Unhandled exception in /process-video/ endpoint: {e}")
        # Log traceback here in production: traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}")