Spaces:

mamogasr
/

llm_engineering

Sleeping

File size: 14,363 Bytes

5fdb69e

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e71d7ff9-c27a-4602-9230-856626b1de07",
   "metadata": {},
   "source": [
    "# Company Brochure Generator UI\n",
    "Generates a brochure for a company website, after scraping the website and pages linked with that page, based on the provided company URL. \n",
    "Enables users to \n",
    "- Choose a model type (Llama 3.2, Claude, GPT)-\n",
    "- Choose the tone preference\n",
    "- Choose the target audience"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de9b59b9-8673-42e7-8849-62fe30f56711",
   "metadata": {},
   "source": [
    "#### Imports, Keys, Instantiation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "39fd7fed-b215-4037-bd6e-7e1af1b83897",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "import json\n",
    "from typing import List\n",
    "from dotenv import load_dotenv\n",
    "from bs4 import BeautifulSoup\n",
    "from IPython.display import Markdown, display, update_display\n",
    "from openai import OpenAI\n",
    "import anthropic\n",
    "import gradio as gr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0bf24357-1d77-4721-9d5a-f99827b2158c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OpenAI API Key exists and begins sk-proj-\n",
      "Anthropic API Key exists and begins sk-ant-\n"
     ]
    }
   ],
   "source": [
    "# Load environment variables in a file called .env\n",
    "\n",
    "load_dotenv(override=True)\n",
    "openai_api_key = os.getenv('OPENAI_API_KEY')\n",
    "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
    "\n",
    "if openai_api_key:\n",
    "    print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
    "else:\n",
    "    print(\"OpenAI API Key not set\")\n",
    "    \n",
    "if anthropic_api_key:\n",
    "    print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n",
    "else:\n",
    "    print(\"Anthropic API Key not set\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "1afc12e1-02c1-4394-b589-19cd08d2a8bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define models\n",
    "CLAUDE_MODEL = \"claude-3-haiku-20240307\"\n",
    "GPT_MODEL = \"gpt-4o-mini\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "d5d79a69-0a39-4ab4-aaf8-bc591bce0536",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating instances\n",
    "claude = anthropic.Anthropic()\n",
    "openai = OpenAI()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d3369bc-b751-4f4d-a288-d7d81c384e67",
   "metadata": {},
   "source": [
    "#### Web Scraper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fafe1074-fbf4-47cc-80dc-34413a447977",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A class to represent a Webpage\n",
    "\n",
    "# Some websites need you to use proper headers when fetching them:\n",
    "headers = {\n",
    " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
    "}\n",
    "\n",
    "class Website:\n",
    "    \"\"\"\n",
    "    A utility class to represent a Website that we have scraped, now with links\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, url):\n",
    "        self.url = url\n",
    "        response = requests.get(url, headers=headers)\n",
    "        self.body = response.content\n",
    "        soup = BeautifulSoup(self.body, 'html.parser')\n",
    "        self.title = soup.title.string if soup.title else \"No title found\"\n",
    "        if soup.body:\n",
    "            for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
    "                irrelevant.decompose()\n",
    "            self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n",
    "        else:\n",
    "            self.text = \"\"\n",
    "        links = [link.get('href') for link in soup.find_all('a')]\n",
    "        self.links = [link for link in links if link]\n",
    "\n",
    "    def get_contents(self):\n",
    "        return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "41c1f1af-ae20-423b-bf7c-efd7f8c2751b",
   "metadata": {},
   "outputs": [],
   "source": [
    "link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n",
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n",
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n",
    "link_system_prompt += \"You should respond in JSON as in this example:\"\n",
    "link_system_prompt += \"\"\"\n",
    "{\n",
    "    \"links\": [\n",
    "        {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n",
    "        {\"type\": \"careers page\": \"url\": \"https://another.full.url/careers\"}\n",
    "    ]\n",
    "}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "eb537563-e393-47ca-9af2-a8ea7393edd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_links_user_prompt(website):\n",
    "    user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n",
    "    user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n",
    "Do not include Terms of Service, Privacy, email or social media links.\\n\"\n",
    "    user_prompt += \"Links (some might be relative links):\\n\"\n",
    "    user_prompt += \"\\n\".join(website.links)\n",
    "    return user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "033568d2-3f1a-43ac-a288-7a65b4ea86a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_links(url):\n",
    "    website = Website(url)\n",
    "    response = openai.chat.completions.create(\n",
    "        model=GPT_MODEL,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": link_system_prompt},\n",
    "            {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n",
    "      ],\n",
    "        response_format={\"type\": \"json_object\"}\n",
    "    )\n",
    "    result = response.choices[0].message.content\n",
    "    return json.loads(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d8f316ac-f0b1-42d9-88a8-0a61fcb0023d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_all_details(url):\n",
    "    result = \"Landing page:\\n\"\n",
    "    result += Website(url).get_contents()\n",
    "    links = get_links(url)\n",
    "    print(\"Found links:\", links)\n",
    "    for link in links[\"links\"]:\n",
    "        print(f\"Processing {link['url']}...\")\n",
    "        result += f\"\\n\\n{link['type']}\\n\"\n",
    "        result += Website(link[\"url\"]).get_contents()\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "016e065a-ac5a-48c0-bc4b-e916e9801384",
   "metadata": {},
   "source": [
    "#### System Message"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "ed1c6068-5f4f-47a7-ab97-738dfb94e057",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_message = \"You are an assistant that analyzes the contents of a company website landing page \\\n",
    "and creates a short brochure about the company for prospective customers, investors and recruits. \\\n",
    "You are also provided with the tone, and the target audience. Provide an appropriate answer. Respond in markdown.\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d4f594c-927d-440f-8aae-33cfeb9c445c",
   "metadata": {},
   "source": [
    "#### LLM Call Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "5b6a0379-3465-4c04-a553-4e4cdb9064b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def stream_gpt(prompt,company_name,url):\n",
    "    messages = [\n",
    "        {\"role\": \"user\", \"content\": prompt},\n",
    "        {\"role\":\"system\",\"content\":system_message}\n",
    "    ]\n",
    "    stream = openai.chat.completions.create(\n",
    "        model=GPT_MODEL,\n",
    "        messages=messages,\n",
    "        stream=True\n",
    "    )\n",
    "    result = \"\"\n",
    "    for chunk in stream:\n",
    "        result += chunk.choices[0].delta.content or \"\"\n",
    "        yield result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a2194e1d-4e99-4127-9515-aa9353382bc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def stream_claude(prompt):\n",
    "    result = claude.messages.stream(\n",
    "        model=CLAUDE_MODEL,\n",
    "        max_tokens=1000,\n",
    "        temperature=0.7,\n",
    "        system=system_message,\n",
    "        messages=[\n",
    "            {\"role\": \"user\", \"content\": prompt},\n",
    "        ],\n",
    "    )\n",
    "    response = \"\"\n",
    "    with result as stream:\n",
    "        for text in stream.text_stream:\n",
    "            response += text or \"\"\n",
    "            yield response"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64adf26c-33b2-4589-8df6-dc5d6da71420",
   "metadata": {},
   "source": [
    "#### Brochure Creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "8192f39f-508b-4592-a075-767db68672b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_brochure_user_prompt(company_name, url):\n",
    "    user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n",
    "    user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n",
    "    user_prompt += get_all_details(url)\n",
    "    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n",
    "    return user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "8aebfabe-4d51-4ee7-a9d2-5a379e9427cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_brochure(company_name, url,model,tone,target):\n",
    "    print('create brochure function called')\n",
    "    prompt = f\"Please generate a company brochure for {company_name}.\"\n",
    "    prompt += f\"Use a {tone} tone; and target content at {target}\"\n",
    "    prompt += get_brochure_user_prompt(company_name,url)\n",
    "    \n",
    "    if model == \"GPT\":\n",
    "        result = stream_gpt(prompt,company_name,url)\n",
    "    elif model==\"Claude\":\n",
    "        result = stream_claude(prompt,company_name,url)\n",
    "    else:\n",
    "        raise ValueError(\"Unknown model\")\n",
    "    yield from result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5f4f97b-c9d0-4d4c-8b02-e6209ba2549c",
   "metadata": {},
   "source": [
    "#### Putting it all together : Gradio UI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "33162303-9b49-46fe-a8e0-0d01be45685b",
   "metadata": {},
   "outputs": [],
   "source": [
    "force_dark_mode = \"\"\"\n",
    "function refresh() {\n",
    "    const url = new URL(window.location);\n",
    "    if (url.searchParams.get('__theme') !== 'dark') {\n",
    "        url.searchParams.set('__theme', 'dark');\n",
    "        window.location.href = url.href;\n",
    "    }\n",
    "}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "47ab9a41-cecd-4c21-bd68-4a15966b80c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "* Running on local URL:  http://127.0.0.1:7877\n",
      "\n",
      "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7877/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found links: {'links': [{'type': 'about page', 'url': 'https://www.vellum.ai/'}, {'type': 'careers page', 'url': 'https://www.vellum.ai/careers'}]}\n",
      "Processing https://www.vellum.ai/...\n",
      "Processing https://www.vellum.ai/careers...\n"
     ]
    }
   ],
   "source": [
    "gr.Interface(\n",
    "    fn=create_brochure,\n",
    "    inputs=[\n",
    "        gr.Textbox(label='Company Name:'),\n",
    "        gr.Textbox(label=\"Landing page URL including http:// or https://\"),\n",
    "        gr.Dropdown(['GPT','Claude'],label='Select Model:'),\n",
    "        gr.Dropdown(['Formal','Casual','Persuasive','Informative','Conversational'],label='Select Tone:'),\n",
    "        gr.Dropdown(['Businesses','General Public','Students','Investors','Customers'],label='Select Target Audience:'),\n",
    "    ],\n",
    "    outputs = [gr.Markdown(label='Brochure')],\n",
    "    flagging_mode = 'never',\n",
    "    js = force_dark_mode\n",
    ").launch(inbrowser=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b923b09-6738-450a-9035-2c8d1bb9cae6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}