KSh100 committed on
Commit
8ac60f7
·
verified ·
1 Parent(s): bcd351e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -18
app.py CHANGED
@@ -40,16 +40,6 @@ def extract_links(soup, base_url):
40
  links.append({"Text": link_text, "URL": full_url})
41
  return links
42
 
43
- def extract_links(soup, base_url):
44
- """Extracts all valid links from the soup."""
45
- links = []
46
- for link in soup.find_all('a', href=True):
47
- href = link['href']
48
- full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
49
- link_text = link.get_text(strip=True) or "No Text"
50
- links.append({"Text": link_text, "URL": full_url})
51
- return links
52
-
53
  def extract_images(soup, base_url):
54
  """Extracts all valid image URLs and their alt text from the soup."""
55
  images = []
@@ -60,14 +50,9 @@ def extract_images(soup, base_url):
60
  images.append({"Alt Text": alt_text, "Image URL": full_img_url})
61
  return images
62
 
63
- def extract_page_title(soup):
64
- """Extracts the page title from the soup."""
65
- title_tag = soup.find('title')
66
- return title_tag.get_text(strip=True) if title_tag else "No Title Found"
67
-
68
  def format_detailed_output(structured_data):
69
  """Formats the structured data into a Markdown string."""
70
- result = f"### Page Title: {structured_data['Page Title']}\n\n"
71
  result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
72
  result += "**Links:**\n"
73
  if structured_data["Links"]:
@@ -91,7 +76,6 @@ def download_and_process_web_page(url):
91
  response = get(url)
92
  soup = response.soup()
93
  structured_data = {
94
- "Page Title": extract_page_title(soup),
95
  "Texts": extract_texts(soup),
96
  "Links": extract_links(soup, url),
97
  "Images": extract_images(soup, url)
@@ -109,7 +93,7 @@ iface = gr.Interface(
109
  inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
110
  outputs=gr.Markdown(label="Web Page Content"),
111
  title="Web Page Processor for Hugging Face Chat Tools",
112
- description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including the title, text, links, and images. This tool is designed for use with Hugging Face Chat Tools.",
113
  share=False # Set share to False to remove the "Share via link" button
114
  )
115
 
 
40
  links.append({"Text": link_text, "URL": full_url})
41
  return links
42
 
 
 
 
 
 
 
 
 
 
 
43
  def extract_images(soup, base_url):
44
  """Extracts all valid image URLs and their alt text from the soup."""
45
  images = []
 
50
  images.append({"Alt Text": alt_text, "Image URL": full_img_url})
51
  return images
52
 
 
 
 
 
 
53
  def format_detailed_output(structured_data):
54
  """Formats the structured data into a Markdown string."""
55
+ result = "### Structured Page Content\n\n"
56
  result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
57
  result += "**Links:**\n"
58
  if structured_data["Links"]:
 
76
  response = get(url)
77
  soup = response.soup()
78
  structured_data = {
 
79
  "Texts": extract_texts(soup),
80
  "Links": extract_links(soup, url),
81
  "Images": extract_images(soup, url)
 
93
  inputs=gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
94
  outputs=gr.Markdown(label="Web Page Content"),
95
  title="Web Page Processor for Hugging Face Chat Tools",
96
+ description="Enter the URL of a web page. The tool will extract and display the structured content of the page, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools.",
97
  share=False # Set share to False to remove the "Share via link" button
98
  )
99