Update app.py
app.py
CHANGED
@@ -7,17 +7,11 @@ from newspaper import Article
 import transformers
 from transformers import GPT2Tokenizer

-#
+# Silence transformers backend warnings
 transformers.logging.set_verbosity_error()

 def text_prompt(request: str, page_url: str, api_key: str, temp: float):
-    """
-    Fetches the article at page_url, extracts text with newspaper3k,
-    trims it to ~1800 GPT-2 tokens, and sends it along with 'request'
-    to OpenAI's Completion API.
-    Returns: (full_input_text, ai_response, total_tokens_used_or_error)
-    """
-    # 1) Fetch and parse the page
+    # Fetch & parse
     try:
         headers = {'User-Agent': 'Chrome/83.0.4103.106'}
         resp = requests.get(page_url, headers=headers, timeout=10)
@@ -26,87 +20,63 @@ def text_prompt(request: str, page_url: str, api_key: str, temp: float):
         page.set_html(resp.text)
         page.parse()
     except Exception as e:
-        return "", f"
+        return "", f"Error fetching URL: {e}", ""

-    #
+    # Tokenize & truncate to ~1800 GPT-2 tokens
     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
     sentences = page.text.split('.')
-    tokens_accum = []
-
-    for sentence in sentences:
-        toks = tokenizer.tokenize(sentence + ".")
+    tokens_accum, truncated_text = [], ""
+    for sent in sentences:
+        toks = tokenizer.tokenize(sent + ".")
     if len(tokens_accum) + len(toks) > 1800:
             break
-        tokens_accum
-        truncated_text +=
-    truncated_text = truncated_text.strip()
+        tokens_accum += toks
+        truncated_text += sent + ". "
     num_input_tokens = len(tokens_accum)

-    # 3) If there's enough content, call OpenAI
     if num_input_tokens < 10:
-        return page.text, f"
+        return page.text, f"Not enough text ({num_input_tokens} tokens)", num_input_tokens

+    # Call GPT-4o mini via ChatCompletion
     openai.api_key = api_key
     try:
-
-
-
-
+        chat_resp = openai.ChatCompletion.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": request + "\n\n>>\n" + truncated_text + "\n<<"}
+            ],
             temperature=temp,
+            max_tokens=2048,
             top_p=0.9,
         )
-        ai_text
-        total_tokens =
-        # Collapse whitespace
-        ai_text = re.sub(r'\s+', ' ', ai_text)
+        ai_text = re.sub(r'\s+', ' ', chat_resp.choices[0].message.content).strip()
+        total_tokens = chat_resp.usage.total_tokens
         return page.text, ai_text, total_tokens
-    except Exception as e:
-        return page.text, f"--- OpenAI API error: {e} ---", num_input_tokens

+    except Exception as e:
+        return page.text, f"OpenAI API error: {e}", num_input_tokens

 if __name__ == "__main__":
-    # Build the Gradio interface
     iface = gr.Interface(
         fn=text_prompt,
         inputs=[
-            gr.Textbox(
-            gr.Textbox(
-            gr.Textbox(
-            gr.Slider(0.0,
+            gr.Textbox(label="Prompt:"),
+            gr.Textbox(label="URL to parse:"),
+            gr.Textbox(label="API-Key:", type="password"),
+            gr.Slider(0.0,1.0,value=0.3, label="Temperature:")
         ],
         outputs=[
             gr.Textbox(label="Input Text:"),
             gr.Textbox(label="AI Output:"),
             gr.Textbox(label="Total Tokens:")
         ],
-        examples=[
-            [
-                "Summarize the following text as a list:",
-                "https://blog.google/outreach-initiatives/google-org/our-commitment-on-using-ai-to-accelerate-progress-on-global-development-goals/",
-                "", 0.3
-            ],
-            [
-                "Generate a summary of the following text. Give me an overview of main business impact from the text following this template:\n- Summary:\n- Business Impact:\n- Companies:",
-                "https://ai.googleblog.com/2019/10/quantum-supremacy-using-programmable.html",
-                "", 0.7
-            ],
-            [
-                "Generate the next insights based on the following text. Indicates N/A if the information is not available in the text.\n- Summary:\n- Acquisition Price:\n- Why is this important for the acquirer:\n- Business Line for the acquirer:\n- Tech Focus for the acquired (list):",
-                "https://techcrunch.com/2022/09/28/eqt-acquires-billtrust-a-company-automating-the-invoice-to-cash-process-for-1-7b/",
-                "", 0.3
-            ]
-        ],
-        title="ChatGPT / GPT-3 Info Extraction from URL",
-        description=(
-            "Fetches text from a URL using newspaper3k, trims it to ~1800 GPT-2 tokens, "
-            "then queries OpenAI's text-davinci-003. Enter your prompt, URL, API key, and temperature."
-        )
+        title="GPT-4o-mini URL Summarizer",
+        description="Uses GPT-4o-mini via ChatCompletion to summarize webpage text."
     )

-    # Launch Gradio with queuing (default concurrency)
     try:
         iface.queue()
         iface.launch()
     except Exception as e:
-
-        print("Failed to launch Gradio interface:", e)
+        print("Failed to launch:", e)
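Note: the updated handler uses openai.ChatCompletion.create, which is the interface of the pre-1.0 openai Python SDK; in openai>=1.0 that method was removed in favor of client.chat.completions.create. As a minimal sketch (not part of this commit, and assuming the Space pins openai<1.0), the same gpt-4o-mini request with the 1.x client would look roughly like the code below; the chat_once helper name is illustrative only.

    from openai import OpenAI

    def chat_once(api_key: str, request: str, truncated_text: str, temp: float):
        # Same prompt framing as in the diff above, sent through the openai>=1.0 client.
        client = OpenAI(api_key=api_key)
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": request + "\n\n>>\n" + truncated_text + "\n<<"},
            ],
            temperature=temp,
            max_tokens=2048,
            top_p=0.9,
        )
        # The 1.x response object exposes the same fields used in app.py.
        return resp.choices[0].message.content, resp.usage.total_tokens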