Update app.py
app.py CHANGED
@@ -1,93 +1,112 @@
 import gradio as gr
 import os
 import openai
-import json
 import re
 from transformers import GPT2Tokenizer
-import requests

     try:
         headers = {'User-Agent': 'Chrome/83.0.4103.106'}
         page = Article('')
-        page.set_html(
         page.parse()
     except Exception as e:
-        return "", f"---

     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
     sentences = page.text.split('.')
-    page_text = ""
     for sentence in sentences:
-        # Trim text to a maximum of 1800 tokens
-        if len(tokens) > 1800:
             break
-        # clean the response text
-        response_text = re.sub(r'\s+', ' ', response_text)
-        response_text = response_text.strip()
-        return page.text, response_text, total_tokens
-    except Exception as e:
-        return page.text, f"--- An error occurred while processing the request: {e} ---", num_tokens
-    return page.text, "--- Min number of tokens:", num_tokens

-            gr.Textbox(lines=1, placeholder="Enter your
-            gr.
             ],
 import gradio as gr
 import os
 import openai
+import requests
 import re
+from newspaper import Article
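+# NOTE: the 'newspaper' module above is installed via the newspaper3k package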
+import transformers
 from transformers import GPT2Tokenizer

+# --- Silence Transformers backend warnings (since you only need the tokenizer) ---
+transformers.logging.set_verbosity_error()

+def text_prompt(request: str, page_url: str, api_key: str, temp: float):
+    """
+    Fetches the article at page_url, extracts text with newspaper3k,
+    trims it to ~1800 GPT-2 tokens, and sends it along with 'request'
+    to OpenAI's Completion API.
+    Returns: (full_input_text, ai_response, total_tokens_used_or_error)
+    """
+    # 1) Fetch and parse the page
     try:
         headers = {'User-Agent': 'Chrome/83.0.4103.106'}
+        resp = requests.get(page_url, headers=headers, timeout=10)
+        resp.raise_for_status()
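+        # raise_for_status() turns 4xx/5xx responses into exceptions, so failed
+        # fetches are reported through the except branch below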
         page = Article('')
+        page.set_html(resp.text)
         page.parse()
     except Exception as e:
+        return "", f"--- Error fetching/parsing URL: {e} ---", ""
+
+    # 2) Tokenize & truncate to ~1800 tokens
     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
     sentences = page.text.split('.')
+    tokens_accum = []
+    truncated_text = ""
     for sentence in sentences:
+        toks = tokenizer.tokenize(sentence + ".")
+        if len(tokens_accum) + len(toks) > 1800:
             break
+        tokens_accum.extend(toks)
+        truncated_text += sentence + ". "
+    truncated_text = truncated_text.strip()
+    num_input_tokens = len(tokens_accum)
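+    # NOTE: GPT-2's tokenizer only approximates text-davinci-003's, but the
+    # 1800-token budget leaves headroom for max_tokens=2048 in the model's ~4k context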

+    # 3) If there's enough content, call OpenAI
+    if num_input_tokens < 10:
+        return page.text, f"--- Not enough text to summarize ({num_input_tokens} tokens) ---", num_input_tokens

+    openai.api_key = api_key
+    try:
+        completion = openai.Completion.create(
+            engine="text-davinci-003",
+            prompt=request + "\n\n>>\n" + truncated_text + "\n<<",
+            max_tokens=2048,
+            temperature=temp,
+            top_p=0.9,
+        )
+        ai_text = completion.choices[0].text.strip()
+        total_tokens = completion.usage.total_tokens
+        # Collapse whitespace
+        ai_text = re.sub(r'\s+', ' ', ai_text)
+        return page.text, ai_text, total_tokens
+    except Exception as e:
+        return page.text, f"--- OpenAI API error: {e} ---", num_input_tokens


+if __name__ == "__main__":
+    # Build the Gradio interface
+    iface = gr.Interface(
+        fn=text_prompt,
+        inputs=[
+            gr.Textbox(lines=1, placeholder="Enter your prompt here...", label="Prompt:"),
+            gr.Textbox(lines=1, placeholder="Enter the URL here...", label="URL to parse:"),
+            gr.Textbox(lines=1, placeholder="Enter your API key here...", label="API-Key:", type="password"),
+            gr.Slider(0.0, 1.0, value=0.3, label="Temperature:")
+        ],
+        outputs=[
+            gr.Textbox(label="Input Text:"),
+            gr.Textbox(label="AI Output:"),
+            gr.Textbox(label="Total Tokens:")
+        ],
+        examples=[
+            [
+                "Summarize the following text as a list:",
+                "https://blog.google/outreach-initiatives/google-org/our-commitment-on-using-ai-to-accelerate-progress-on-global-development-goals/",
+                "", 0.3
             ],
+            [
+                "Generate a summary of the following text. Give me an overview of the main business impact from the text following this template:\n- Summary:\n- Business Impact:\n- Companies:",
+                "https://ai.googleblog.com/2019/10/quantum-supremacy-using-programmable.html",
+                "", 0.7
+            ],
+            [
+                "Generate the next insights based on the following text. Indicate N/A if the information is not available in the text.\n- Summary:\n- Acquisition Price:\n- Why is this important for the acquirer:\n- Business Line for the acquirer:\n- Tech Focus for the acquired (list):",
+                "https://techcrunch.com/2022/09/28/eqt-acquires-billtrust-a-company-automating-the-invoice-to-cash-process-for-1-7b/",
+                "", 0.3
+            ]
+        ],
+        title="ChatGPT / GPT-3 Info Extraction from URL",
+        description=(
+            "Fetches text from a URL using newspaper3k, trims it to ~1800 GPT-2 tokens, "
+            "then queries OpenAI's text-davinci-003. Enter your prompt, URL, API key, and temperature."
+        )
+    )

+    # Launch Gradio with queuing (default concurrency)
+    try:
+        iface.queue()
+        iface.launch()
+    except Exception as e:
+        # Print the error so it shows up in your logs/terminal
+        print("Failed to launch Gradio interface:", e)