Update app.py
Browse files
app.py
CHANGED
@@ -40,16 +40,6 @@ def extract_links(soup, base_url):
|
|
40 |
links.append({"Text": link_text, "URL": full_url})
|
41 |
return links
|
42 |
|
43 |
-
def extract_links(soup, base_url):
|
44 |
-
"""Extracts all valid links from the soup."""
|
45 |
-
links = []
|
46 |
-
for link in soup.find_all('a', href=True):
|
47 |
-
href = link['href']
|
48 |
-
full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
|
49 |
-
link_text = link.get_text(strip=True) or "No Text"
|
50 |
-
links.append({"Text": link_text, "URL": full_url})
|
51 |
-
return links
|
52 |
-
|
53 |
def extract_images(soup, base_url):
|
54 |
"""Extracts all valid image URLs and their alt text from the soup."""
|
55 |
images = []
|
@@ -60,14 +50,9 @@ def extract_images(soup, base_url):
|
|
60 |
images.append({"Alt Text": alt_text, "Image URL": full_img_url})
|
61 |
return images
|
62 |
|
63 |
-
def extract_page_title(soup):
|
64 |
-
"""Extracts the page title from the soup."""
|
65 |
-
title_tag = soup.find('title')
|
66 |
-
return title_tag.get_text(strip=True) if title_tag else "No Title Found"
|
67 |
-
|
68 |
def format_detailed_output(structured_data):
|
69 |
"""Formats the structured data into a Markdown string."""
|
70 |
-
result =
|
71 |
result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
|
72 |
result += "**Links:**\n"
|
73 |
if structured_data["Links"]:
|
@@ -91,7 +76,6 @@ def download_and_process_web_page(url):
|
|
91 |
response = get(url)
|
92 |
soup = response.soup()
|
93 |
structured_data = {
|
94 |
-
"Page Title": extract_page_title(soup),
|
95 |
"Texts": extract_texts(soup),
|
96 |
"Links": extract_links(soup, url),
|
97 |
"Images": extract_images(soup, url)
|
@@ -109,7 +93,7 @@ iface = gr.Interface(
|
|
109 |
inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
|
110 |
outputs=gr.Markdown(label="Web Page Content"),
|
111 |
title="Web Page Processor for Hugging Face Chat Tools",
|
112 |
-
description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including
|
113 |
share=False # Set share to False to remove the "Share via link" button
|
114 |
)
|
115 |
|
|
|
40 |
links.append({"Text": link_text, "URL": full_url})
|
41 |
return links
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
def extract_images(soup, base_url):
|
44 |
"""Extracts all valid image URLs and their alt text from the soup."""
|
45 |
images = []
|
|
|
50 |
images.append({"Alt Text": alt_text, "Image URL": full_img_url})
|
51 |
return images
|
52 |
|
|
|
|
|
|
|
|
|
|
|
53 |
def format_detailed_output(structured_data):
|
54 |
"""Formats the structured data into a Markdown string."""
|
55 |
+
result = "### Structured Page Content\n\n"
|
56 |
result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
|
57 |
result += "**Links:**\n"
|
58 |
if structured_data["Links"]:
|
|
|
76 |
response = get(url)
|
77 |
soup = response.soup()
|
78 |
structured_data = {
|
|
|
79 |
"Texts": extract_texts(soup),
|
80 |
"Links": extract_links(soup, url),
|
81 |
"Images": extract_images(soup, url)
|
|
|
93 |
inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
|
94 |
outputs=gr.Markdown(label="Web Page Content"),
|
95 |
title="Web Page Processor for Hugging Face Chat Tools",
|
96 |
+
description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools.",
|
97 |
share=False # Set share to False to remove the "Share via link" button
|
98 |
)
|
99 |
|