Update app.py
app.py CHANGED
@@ -1,8 +1,7 @@
-import
+import gradio as gr
 import urllib3
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
-import gradio as gr
 
 # --- Custom HTTP Session and Response Classes ---
 
@@ -63,42 +62,28 @@ def extract_images(soup, base_url):
 def format_detailed_output(structured_data):
     """Formats the structured data into a Markdown string."""
     result = "### Structured Page Content\n\n"
-
-    # Texts
-    result += "**Texts:**\n"
-    if structured_data["Texts"]:
-        result += " ".join(structured_data["Texts"]) + "\n\n"
-    else:
-        result += "No textual content found.\n\n"
-
-    # Links
+    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
     result += "**Links:**\n"
     if structured_data["Links"]:
-        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n\n"
+        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
     else:
-        result += "No links found.\n\n"
-
-    # Images
+        result += "No links found.\n"
     result += "**Images:**\n"
     if structured_data["Images"]:
-        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n\n"
+        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
     else:
-        result += "No images found.\n\n"
-
+        result += "No images found.\n"
     return result
 
-# --- Web Page Processing
+# --- Web Page Processing Function ---
 
 def download_and_process_web_page(url):
-    """Downloads a web page
+    """Downloads a web page and returns the structured content."""
     if not url.startswith("http://") and not url.startswith("https://"):
         url = "http://" + url  # Prepend "http://" if not present
 
     try:
         response = get(url)
-        if response.status_code != 200:
-            return f"Error: Received status code {response.status_code}"
-
         soup = response.soup()
         structured_data = {
             "Texts": extract_texts(soup),
@@ -122,8 +107,8 @@ iface = gr.Interface(
     outputs=[
         gr.Markdown(label="Web Page Content"),
     ],
-    title="
-    description="Enter the URL of a web page. The tool will extract and
+    title="Web Page Processor for Hugging Face Chat Tools",
+    description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools.",
     concurrency_limit=None,
     api_name="main"
 )
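
The changed code leans on two helpers from the unshown "# --- Custom HTTP Session and Response Classes ---" section: get(url) and response.soup(). A minimal sketch of what that section might look like, assuming it builds on the urllib3 and BeautifulSoup imports at the top of app.py (the names http and Response, and everything inside them, are guesses for illustration, not code from this commit):

import urllib3
from bs4 import BeautifulSoup

# Hypothetical reconstruction; the real classes are outside this diff.
http = urllib3.PoolManager()  # shared connection pool for all fetches

class Response:
    """Wraps an urllib3 response so callers can read the status and parse the HTML."""
    def __init__(self, raw):
        self.status_code = raw.status  # HTTP status code of the fetch
        self._body = raw.data          # raw response bytes

    def soup(self):
        """Parse the body with BeautifulSoup, as download_and_process_web_page expects."""
        return BeautifulSoup(self._body, "html.parser")

def get(url):
    """Fetch a URL through the shared pool and wrap the result in a Response."""
    return Response(http.request("GET", url))

Routing every fetch through one PoolManager would reuse connections across requests, which fits the "custom session" framing of the comment.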
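
To see what the collapsed Texts branch produces, here is the refactored format_detailed_output applied to a small hand-made structured_data dict (all values invented for illustration):

structured_data = {
    "Texts": ["Hello", "world"],
    "Links": [{"Text": "Docs", "URL": "https://example.com/docs"}],
    "Images": [],
}
print(format_detailed_output(structured_data))
# ### Structured Page Content
#
# **Texts:**
# Hello world
#
# **Links:**
# [Docs](https://example.com/docs)
# **Images:**
# No images found.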