Spaces:

mamogasr
/

llm_engineering

Sleeping

File size: 5,504 Bytes

5fdb69e

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from IPython.display import Markdown, display\n",
    "from openai import OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install youtube_transcript_api"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "from youtube_transcript_api import YouTubeTranscriptApi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load environment variables in a file called .env\n",
    "\n",
    "load_dotenv(override=True)\n",
    "api_key = os.getenv('OPENAI_API_KEY')\n",
    "\n",
    "# Flag a missing key here instead of letting it surface later as a client\n",
    "# error when OpenAI() is constructed.\n",
    "if not api_key:\n",
    "    print('OPENAI_API_KEY is not set - add it to your .env file before running the cells below')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "class YouTubeWebLink:\n",
    "    def __init__(self, url):\n",
    "        self.url = url\n",
    "        self.video_id = self.get_video_id(url)\n",
    "        self.set_openai_client()\n",
    "        self.set_system_prompt()\n",
    "\n",
    "    def get_video_id(self, url):\n",
    "        \"\"\" extract youtube video id from url with regular expression \"\"\"\n",
    "        regex = r\"(?:v=|be/)([a-zA-Z0-9_-]{11})\"\n",
    "        match = re.search(regex, url)\n",
    "        if match:\n",
    "            return match.group(1)\n",
    "        else:\n",
    "            raise ValueError(\"Probably not a YouTube URL\")\n",
    "        \n",
    "    def set_openai_client(self):\n",
    "        self.openai = OpenAI()\n",
    "        \n",
    "    def set_system_prompt(self, system_prompt=None):\n",
    "        \"\"\" set system prompt from youtube video \"\"\"\n",
    "        self.system_prompt = \"\"\"\n",
    "        You are a skilled explainer and storyteller who specializes in summarizing YouTube video transcripts in a way that's both engaging and informative. \n",
    "        Your task is to:\n",
    "        - Capture key points and main ideas of the video\n",
    "        - Structure your summary with in clear sections\n",
    "        - Include important details, facts, and figures mentioned\n",
    "        - Never end your summary with a \"Conclusion\" section\n",
    "        - Keep the summary short and easy to understand\n",
    "        - Always format your response in markdown for better readability\n",
    "        \"\"\" if system_prompt is None else system_prompt\n",
    "\n",
    "    def get_transcript(self):\n",
    "        \"\"\" get transcript from youtube video \"\"\"\n",
    "        try:\n",
    "            print('Fetching video transcript...')\n",
    "            transcript = YouTubeTranscriptApi.get_transcript(self.video_id)\n",
    "            return \" \".join([item['text'] for item in transcript])\n",
    "        except Exception as e:\n",
    "            print(f\"Error fetching transcript: {e}\")\n",
    "            return None\n",
    "        \n",
    "    def get_summary_from_transcript(self, transcript):\n",
    "        \"\"\" summarize text using openai \"\"\"\n",
    "        try:\n",
    "            print('Summarizing video...')\n",
    "            response = self.openai.chat.completions.create(\n",
    "                model=\"gpt-4o-mini\",\n",
    "                messages=[\n",
    "                    {\"role\": \"system\", \"content\": self.system_prompt},\n",
    "                    {\"role\": \"user\", \"content\": f\"Summarize the following YouTube video transcript:\\n\\n{transcript}\"}\n",
    "                ]\n",
    "            )\n",
    "            return response.choices[0].message.content\n",
    "        except Exception as e:\n",
    "            print(f\"Error summarizing text: {e}\")\n",
    "            return None\n",
    "\n",
    "    def display_summary(self):\n",
    "        \"\"\" summarize youtube video \"\"\"\n",
    "        transcript = self.get_transcript()\n",
    "        summary = self.get_summary_from_transcript(transcript)\n",
    "        display(Markdown(summary))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The same video in two URL forms: the watch-page link (with a playlist\n",
    "# parameter) and the short youtu.be share link\n",
    "test_url_1 = \"https://www.youtube.com/watch?v=nYy-umCNKPQ&list=PLWHe-9GP9SMMdl6SLaovUQF2abiLGbMjs\"\n",
    "test_url_2 = \"https://youtu.be/nYy-umCNKPQ?si=ILVrQlKT0W71G5pU\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both URL forms must resolve to the same video id\n",
    "video1, video2 = YouTubeWebLink(test_url_1), YouTubeWebLink(test_url_2)\n",
    "assert video1.video_id == video2.video_id == \"nYy-umCNKPQ\", \"URL forms resolved to different video ids\"\n",
    "video1.video_id, video2.video_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the transcript, summarize it with OpenAI and render the result as Markdown\n",
    "video1.display_summary()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llms",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}