Update app.py
app.py CHANGED
@@ -1,93 +1,112 @@
 import gradio as gr
 import os
 import openai
-import json
 import re
 from transformers import GPT2Tokenizer
-import requests

     try:
         headers = {'User-Agent': 'Chrome/83.0.4103.106'}
         page = Article('')
-        page.set_html(
         page.parse()
     except Exception as e:
-        return "", f"---

     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
     sentences = page.text.split('.')
-    page_text = ""
     for sentence in sentences:
-        # Trim text to a maximum of 1800 tokens
-        if len(tokens) > 1800:
             break
-        # clean the response text
-        response_text = re.sub(r'\s+', ' ', response_text)
-        response_text = response_text.strip()
-        return page.text, response_text, total_tokens
-    except Exception as e:
-        return page.text, f"--- An error occurred while processing the request: {e} ---", num_tokens
-    return page.text, "--- Min number of tokens:", num_tokens

-            gr.Textbox(lines=1, placeholder="Enter your
-            gr.
             ],
 import gradio as gr
 import os
 import openai
+import requests
 import re
+from newspaper import Article
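+# NOTE: the 'newspaper' module above is installed via the newspaper3k package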
+import transformers
 from transformers import GPT2Tokenizer

+# --- Silence Transformers backend warnings (since you only need the tokenizer) ---
+transformers.logging.set_verbosity_error()

+def text_prompt(request: str, page_url: str, api_key: str, temp: float):
+    """
+    Fetches the article at page_url, extracts text with newspaper3k,
+    trims it to ~1800 GPT-2 tokens, and sends it along with 'request'
+    to OpenAI's Completion API.
+    Returns: (full_input_text, ai_response, total_tokens_used_or_error)
+    """
+    # 1) Fetch and parse the page
     try:
         headers = {'User-Agent': 'Chrome/83.0.4103.106'}
+        resp = requests.get(page_url, headers=headers, timeout=10)
+        resp.raise_for_status()
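+        # raise_for_status() turns 4xx/5xx responses into exceptions, so failed
+        # fetches are reported through the except branch below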
         page = Article('')
+        page.set_html(resp.text)
         page.parse()
     except Exception as e:
+        return "", f"--- Error fetching/parsing URL: {e} ---", ""
+
+    # 2) Tokenize & truncate to ~1800 tokens
     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
     sentences = page.text.split('.')
+    tokens_accum = []
+    truncated_text = ""
     for sentence in sentences:
+        toks = tokenizer.tokenize(sentence + ".")
+        if len(tokens_accum) + len(toks) > 1800:
             break
+        tokens_accum.extend(toks)
+        truncated_text += sentence + ". "
+    truncated_text = truncated_text.strip()
+    num_input_tokens = len(tokens_accum)
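+    # NOTE: GPT-2's tokenizer only approximates text-davinci-003's, but the
+    # 1800-token budget leaves headroom for max_tokens=2048 in the model's ~4k context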

+    # 3) If there's enough content, call OpenAI
+    if num_input_tokens < 10:
+        return page.text, f"--- Not enough text to summarize ({num_input_tokens} tokens) ---", num_input_tokens

+    openai.api_key = api_key
+    try:
+        completion = openai.Completion.create(
+            engine="text-davinci-003",
+            prompt=request + "\n\n>>\n" + truncated_text + "\n<<",
+            max_tokens=2048,
+            temperature=temp,
+            top_p=0.9,
+        )
+        ai_text = completion.choices[0].text.strip()
+        total_tokens = completion.usage.total_tokens
+        # Collapse whitespace
+        ai_text = re.sub(r'\s+', ' ', ai_text)
+        return page.text, ai_text, total_tokens
+    except Exception as e:
+        return page.text, f"--- OpenAI API error: {e} ---", num_input_tokens


+if __name__ == "__main__":
+    # Build the Gradio interface
+    iface = gr.Interface(
+        fn=text_prompt,
+        inputs=[
+            gr.Textbox(lines=1, placeholder="Enter your prompt here...", label="Prompt:"),
+            gr.Textbox(lines=1, placeholder="Enter the URL here...", label="URL to parse:"),
+            gr.Textbox(lines=1, placeholder="Enter your API key here...", label="API-Key:", type="password"),
+            gr.Slider(0.0, 1.0, value=0.3, label="Temperature:")
+        ],
+        outputs=[
+            gr.Textbox(label="Input Text:"),
+            gr.Textbox(label="AI Output:"),
+            gr.Textbox(label="Total Tokens:")
+        ],
+        examples=[
+            [
+                "Summarize the following text as a list:",
+                "https://blog.google/outreach-initiatives/google-org/our-commitment-on-using-ai-to-accelerate-progress-on-global-development-goals/",
+                "", 0.3
             ],
+            [
+                "Generate a summary of the following text. Give me an overview of the main business impact from the text following this template:\n- Summary:\n- Business Impact:\n- Companies:",
+                "https://ai.googleblog.com/2019/10/quantum-supremacy-using-programmable.html",
+                "", 0.7
+            ],
+            [
+                "Generate the next insights based on the following text. Indicate N/A if the information is not available in the text.\n- Summary:\n- Acquisition Price:\n- Why is this important for the acquirer:\n- Business Line for the acquirer:\n- Tech Focus for the acquired (list):",
+                "https://techcrunch.com/2022/09/28/eqt-acquires-billtrust-a-company-automating-the-invoice-to-cash-process-for-1-7b/",
+                "", 0.3
+            ]
+        ],
+        title="ChatGPT / GPT-3 Info Extraction from URL",
+        description=(
+            "Fetches text from a URL using newspaper3k, trims it to ~1800 GPT-2 tokens, "
+            "then queries OpenAI's text-davinci-003. Enter your prompt, URL, API key, and temperature."
+        )
+    )

+    # Launch Gradio with queuing (default concurrency)
+    try:
+        iface.queue()
+        iface.launch()
+    except Exception as e:
+        # Print the error so it shows up in your logs/terminal
+        print("Failed to launch Gradio interface:", e)