Spaces:

mamogasr
/

llm_engineering

Sleeping

File size: 13,261 Bytes

5fdb69e

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4a6ab9a2-28a2-445d-8512-a0dc8d1b54e9",
   "metadata": {},
   "source": [
    "# Code Commenter\n",
    "\n",
    "The requirement: use an LLM to generate docstring and comments for Python code\n",
    "\n",
    "This is my week 4 day 5 project. \n",
    "\n",
    "Note: I used gpt to find out the most effective system and user prompt (very effective). I also decided not to use the open source models due to inference api costs with HF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e610bf56-a46e-4aff-8de1-ab49d62b1ad3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import io\n",
    "import sys\n",
    "import json\n",
    "import requests\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "import google.generativeai\n",
    "import anthropic\n",
    "from IPython.display import Markdown, display, update_display\n",
    "import gradio as gr\n",
    "import subprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4f672e1c-87e9-4865-b760-370fa605e614",
   "metadata": {},
   "outputs": [],
   "source": [
    "# environment\n",
    "\n",
    "load_dotenv()\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
    "os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')\n",
    "google_api_key = os.getenv('GOOGLE_API_KEY')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8aa149ed-9298-4d69-8fe2-8f5de0f667da",
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialize\n",
    "\n",
    "openai = OpenAI()\n",
    "claude = anthropic.Anthropic()\n",
    "google.generativeai.configure()\n",
    "\n",
    "OPENAI_MODEL = \"gpt-4o\"\n",
    "CLAUDE_MODEL = \"claude-3-5-sonnet-20240620\"\n",
    "GOOGLE_MODEL = \"gemini-1.5-pro\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6896636f-923e-4a2c-9d6c-fac07828a201",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_message = \"You are a Python code assistant. Your task is to analyze Python code and generate high-quality, concise comments and docstrings. Follow these guidelines:\"\n",
    "system_message += \"Docstrings: Add a docstring for every function, class, and module. Describe the purpose of the function/class, its parameters, and its return value. Keep the description concise but informative, using proper Python docstring conventions (e.g., Google, NumPy, or reStructuredText format).\"\n",
    "system_message += \"Inline Comments: Add inline comments only where necessary to clarify complex logic, important steps, or non-obvious behavior. Avoid commenting on obvious operations like x += 1 unless it involves a nuanced concept. Keep comments short, clear, and relevant.\"\n",
    "system_message += \"General Instructions: Maintain consistency in style and tone. Use technical terminology where appropriate, but ensure clarity for someone with intermediate Python knowledge. Do not over-explain or add redundant comments for self-explanatory code. Follow PEP 257 and PEP 8 standards for style and formatting.\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8e7b3546-57aa-4c29-bc5d-f211970d04eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def user_prompt_for(python):\n",
    "    user_prompt = \"Analyze the following Python code and enhance it by adding high-quality, concise docstrings and comments. \"\n",
    "    user_prompt += \"Ensure all functions, classes, and modules have appropriate docstrings describing their purpose, parameters, and return values. \"\n",
    "    user_prompt += \"Add inline comments only for complex or non-obvious parts of the code. \"\n",
    "    user_prompt += \"Follow Python's PEP 257 and PEP 8 standards for documentation and formatting. \"\n",
    "    user_prompt += \"Do not modify the code itself; only add annotations.\\n\\n\"\n",
    "    user_prompt += python\n",
    "    return user_prompt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c6190659-f54c-4951-bef4-4960f8e51cc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def messages_for(python):\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": system_message},\n",
    "        {\"role\": \"user\", \"content\": user_prompt_for(python)}\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a1cbb778-fa57-43de-b04b-ed523f396c38",
   "metadata": {},
   "outputs": [],
   "source": [
    "pi = \"\"\"\n",
    "import time\n",
    "\n",
    "def calculate(iterations, param1, param2):\n",
    "    result = 1.0\n",
    "    for i in range(1, iterations+1):\n",
    "        j = i * param1 - param2\n",
    "        result -= (1/j)\n",
    "        j = i * param1 + param2\n",
    "        result += (1/j)\n",
    "    return result\n",
    "\n",
    "start_time = time.time()\n",
    "result = calculate(100_000_000, 4, 1) * 4\n",
    "end_time = time.time()\n",
    "\n",
    "print(f\"Result: {result:.12f}\")\n",
    "print(f\"Execution Time: {(end_time - start_time):.6f} seconds\")\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c3b497b3-f569-420e-b92e-fb0f49957ce0",
   "metadata": {},
   "outputs": [],
   "source": [
    "python_hard = \"\"\"# Be careful to support large number sizes\n",
    "\n",
    "def lcg(seed, a=1664525, c=1013904223, m=2**32):\n",
    "    value = seed\n",
    "    while True:\n",
    "        value = (a * value + c) % m\n",
    "        yield value\n",
    "        \n",
    "def max_subarray_sum(n, seed, min_val, max_val):\n",
    "    lcg_gen = lcg(seed)\n",
    "    random_numbers = [next(lcg_gen) % (max_val - min_val + 1) + min_val for _ in range(n)]\n",
    "    max_sum = float('-inf')\n",
    "    for i in range(n):\n",
    "        current_sum = 0\n",
    "        for j in range(i, n):\n",
    "            current_sum += random_numbers[j]\n",
    "            if current_sum > max_sum:\n",
    "                max_sum = current_sum\n",
    "    return max_sum\n",
    "\n",
    "def total_max_subarray_sum(n, initial_seed, min_val, max_val):\n",
    "    total_sum = 0\n",
    "    lcg_gen = lcg(initial_seed)\n",
    "    for _ in range(20):\n",
    "        seed = next(lcg_gen)\n",
    "        total_sum += max_subarray_sum(n, seed, min_val, max_val)\n",
    "    return total_sum\n",
    "\n",
    "# Parameters\n",
    "n = 10000         # Number of random numbers\n",
    "initial_seed = 42 # Initial seed for the LCG\n",
    "min_val = -10     # Minimum value of random numbers\n",
    "max_val = 10      # Maximum value of random numbers\n",
    "\n",
    "# Timing the function\n",
    "import time\n",
    "start_time = time.time()\n",
    "result = total_max_subarray_sum(n, initial_seed, min_val, max_val)\n",
    "end_time = time.time()\n",
    "\n",
    "print(\"Total Maximum Subarray Sum (20 runs):\", result)\n",
    "print(\"Execution Time: {:.6f} seconds\".format(end_time - start_time))\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0be9f47d-5213-4700-b0e2-d444c7c738c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def stream_gpt(python):    \n",
    "    stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages_for(python), stream=True)\n",
    "    reply = \"\"\n",
    "    for chunk in stream:\n",
    "        fragment = chunk.choices[0].delta.content or \"\"\n",
    "        reply += fragment\n",
    "        yield reply.replace('```python\\n','').replace('```','')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8669f56b-8314-4582-a167-78842caea131",
   "metadata": {},
   "outputs": [],
   "source": [
    "def stream_claude(python):\n",
    "    result = claude.messages.stream(\n",
    "        model=CLAUDE_MODEL,\n",
    "        max_tokens=2000,\n",
    "        system=system_message,\n",
    "        messages=[{\"role\": \"user\", \"content\": user_prompt_for(python)}],\n",
    "    )\n",
    "    reply = \"\"\n",
    "    with result as stream:\n",
    "        for text in stream.text_stream:\n",
    "            reply += text\n",
    "            yield reply.replace('```python\\n','').replace('```','')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "25f8d215-67a8-4179-8834-0e1da5a7dd32",
   "metadata": {},
   "outputs": [],
   "source": [
    "def stream_google(python):\n",
    "    # Initialize empty reply string\n",
    "    reply = \"\"\n",
    "    \n",
    "    # The API for Gemini has a slightly different structure\n",
    "    gemini = google.generativeai.GenerativeModel(\n",
    "        model_name=GOOGLE_MODEL,\n",
    "        system_instruction=system_message\n",
    "    )\n",
    "    \n",
    "    response = gemini.generate_content(\n",
    "        user_prompt_for(python),\n",
    "        stream=True\n",
    "    )\n",
    "    \n",
    "    # Process the stream\n",
    "    for chunk in response:\n",
    "        # Extract text from the chunk\n",
    "        if chunk.text:\n",
    "            reply += chunk.text\n",
    "            yield reply.replace('```python\\n','').replace('```','')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "2f1ae8f5-16c8-40a0-aa18-63b617df078d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def optimize(python, model):\n",
    "    if model==\"GPT\":\n",
    "        result = stream_gpt(python)\n",
    "    elif model==\"Claude\":\n",
    "        result = stream_claude(python)\n",
    "    elif model==\"Gemini\":\n",
    "        result = stream_google(python)\n",
    "    else:\n",
    "        raise ValueError(\"Unknown model\")\n",
    "    for stream_so_far in result:\n",
    "        yield stream_so_far        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "43a6b5f5-5d7c-4511-9d0c-21640070b3cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def execute_python(code):\n",
    "        try:\n",
    "            output = io.StringIO()\n",
    "            sys.stdout = output\n",
    "            exec(code)\n",
    "        finally:\n",
    "            sys.stdout = sys.__stdout__\n",
    "        return output.getvalue()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f35b0602-84f9-4ed6-aa35-87be4290ed24",
   "metadata": {},
   "outputs": [],
   "source": [
    "css = \"\"\"\n",
    ".python {background-color: #306998;}\n",
    ".cpp {background-color: #050;}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "62488014-d34c-4de8-ba72-9516e05e9dde",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "* Running on local URL:  http://127.0.0.1:7860\n",
      "\n",
      "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with gr.Blocks(css=css) as ui:\n",
    "    gr.Markdown(\"## Convert code from Python to C++\")\n",
    "    with gr.Row():\n",
    "        python = gr.Textbox(label=\"Python code:\", value=pi, lines=10)\n",
    "        commented_python = gr.Textbox(label=\"Commented code:\", lines=10)\n",
    "    with gr.Row():\n",
    "        model = gr.Dropdown([\"GPT\", \"Claude\", \"Gemini\"], label=\"Select model\", value=\"GPT\")\n",
    "    with gr.Row():\n",
    "        comment = gr.Button(\"Comment code\")\n",
    "    with gr.Row():\n",
    "        python_run = gr.Button(\"Check Commented Python\")\n",
    "    with gr.Row():\n",
    "        python_out = gr.TextArea(label=\"Python result:\", elem_classes=[\"python\"])\n",
    "\n",
    "    comment.click(optimize, inputs=[python, model], outputs=[commented_python])\n",
    "    python_run.click(execute_python, inputs=[python], outputs=[python_out])\n",
    "\n",
    "ui.launch(inbrowser=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b084760b-c327-4fe7-9b7c-a01b1a383dc3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}