Create app.py
app.py
ADDED
@@ -0,0 +1,125 @@
import re
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import gradio as gr

# Constants
CHUNK_SIZE = 32000

# --- Custom HTTP Session and Response Classes ---

class CustomSession:
    def __init__(self):
        self.pool_manager = urllib3.PoolManager()

    def get(self, url):
        response = self.pool_manager.request('GET', url)
        return CustomResponse(response)

class CustomResponse:
    def __init__(self, response):
        self.status_code = response.status
        self.headers = response.headers
        self.content = response.data

    def soup(self):
        return BeautifulSoup(self.content, 'lxml')

    def clean_text(self):
        soup = self.soup()
        cleaned_text = soup.get_text().replace('\n', ' ').replace('\r', ' ').replace('  ', ' ')
        while '  ' in cleaned_text:
            cleaned_text = cleaned_text.replace('  ', ' ')
        return cleaned_text.strip()

def get(url):
    session = CustomSession()
    return session.get(url)

# --- Utility Functions ---

def extract_texts(soup):
    """Extracts all text content from the soup."""
    return [text for text in soup.stripped_strings]

def extract_links(soup, base_url):
    """Extracts all valid links from the soup."""
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
        link_text = link.get_text(strip=True) or "No Text"
        links.append({"Text": link_text, "URL": full_url})
    return links

def extract_images(soup, base_url):
    """Extracts all valid image URLs and their alt text from the soup."""
    images = []
    for img in soup.find_all('img', src=True):
        img_url = img['src']
        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
        alt_text = img.get('alt', 'No Alt Text')
        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
    return images

def format_detailed_output(structured_data):
    """Formats the structured data into a Markdown string."""
    result = "### Structured Page Content\n\n"
    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
    result += "**Links:**\n"
    if structured_data["Links"]:
        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
    else:
        result += "No links found.\n"
    result += "**Images:**\n"
    if structured_data["Images"]:
        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
    else:
        result += "No images found.\n"
    return result

# --- Web Page Processing Functions ---

def download_and_process_web_page(url, clean=True):
    """Downloads a web page from a URL and processes its content."""
    if not url.startswith("http://") and not url.startswith("https://"):
        url = "http://" + url  # Prepend "http://" if not present

    try:
        response = get(url)
        if response.status_code != 200:
            return f"Error: Received status code {response.status_code}", 0

        soup = response.soup()
        structured_data = {
            # When "Clean Text" is enabled, use the whitespace-normalized page text.
            "Texts": [response.clean_text()] if clean else extract_texts(soup),
            "Links": extract_links(soup, url),
            "Images": extract_images(soup, url)
        }
        formatted = format_detailed_output(structured_data)
        return formatted, len(formatted)

    except urllib3.exceptions.HTTPError as e:
        return f"Error: {e}", 0
    except Exception as e:
        return f"Error processing web page: {e}", 0

# --- Gradio Interface ---

iface = gr.Interface(
    fn=download_and_process_web_page,
    inputs=[
        gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
        gr.Checkbox(label="Clean Text", value=True),
    ],
    outputs=[
        gr.Markdown(label="Web Page Content"),
        gr.Number(label="Content Length (characters)"),
    ],
    title="Enhanced Web Page Processor for Hugging Face Chat Tools",
    description="Enter the URL of a web page. The tool will extract and format its content, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools. \n [https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
    concurrency_limit=None,
    api_name="main"
)

iface.launch()
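
For a quick smoke test of the endpoint exposed above, a minimal sketch like the following could query the app over its API. It assumes `python app.py` is already running locally with the default address that `iface.launch()` prints, that the `gradio_client` package is installed, and that `example.com` is only an illustrative URL; the `/main` endpoint name follows from `api_name="main"` in the interface definition.

```python
# Minimal sketch: query the running app over its Gradio API.
# Assumes the app is running locally on the default port and
# gradio_client is installed; example.com is an illustrative URL.
from gradio_client import Client

client = Client("http://127.0.0.1:7860/")
content, length = client.predict("example.com", True, api_name="/main")
print(f"Content length: {length}")
print(content[:500])
```

The same call pattern should work against the hosted Space URL once the app is deployed there.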