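"""Gradio web-page processor for Hugging Face Chat Tools.

Fetches a URL with urllib3, parses it with BeautifulSoup, and returns the
page's text, links, and images as a structured Markdown summary.
"""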
import re
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import gradio as gr

# Constants
CHUNK_SIZE = 32000  # not currently referenced in this script

# --- Custom HTTP Session and Response Classes ---

class CustomSession:
    def __init__(self):
        self.pool_manager = urllib3.PoolManager()

    def get(self, url):
        response = self.pool_manager.request('GET', url)
        return CustomResponse(response)

class CustomResponse:
    def __init__(self, response):
        self.status_code = response.status
        self.headers = response.headers
        self.content = response.data

    def soup(self):
        return BeautifulSoup(self.content, 'lxml')  # requires the lxml package

    def clean_text(self):
        """Return the page text with runs of whitespace collapsed to single spaces."""
        soup = self.soup()
        return re.sub(r'\s+', ' ', soup.get_text()).strip()

def get(url):
    session = CustomSession()
    return session.get(url)

# --- Utility Functions ---

def extract_texts(soup):
    """Extracts all text content from the soup."""
    return [text for text in soup.stripped_strings]

def extract_links(soup, base_url):
    """Extracts all valid links from the soup."""
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
        link_text = link.get_text(strip=True) or "No Text"
        links.append({"Text": link_text, "URL": full_url})
    return links

def extract_images(soup, base_url):
    """Extracts all valid image URLs and their alt text from the soup."""
    images = []
    for img in soup.find_all('img', src=True):
        img_url = img['src']
        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
        alt_text = img.get('alt', 'No Alt Text')
        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
    return images

def format_detailed_output(structured_data):
    """Formats the structured data into a Markdown string."""
    result = "### Structured Page Content\n\n"
    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
    result += "**Links:**\n"
    if structured_data["Links"]:
        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n\n"
    else:
        result += "No links found.\n\n"
    result += "**Images:**\n"
    if structured_data["Images"]:
        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
    else:
        result += "No images found.\n"
    return result

# --- Web Page Processing Functions ---

def download_and_process_web_page(url):
    """Downloads a web page from a URL and processes its content."""
    if not url.startswith(("http://", "https://")):
        url = "http://" + url  # Prepend a scheme if none was given

    try:
        response = get(url)
        if response.status_code != 200:
            return f"Error: Received status code {response.status_code}"

        soup = response.soup()
        structured_data = {
            "Texts": extract_texts(soup),
            "Links": extract_links(soup, url),
            "Images": extract_images(soup, url)
        }
        return format_detailed_output(structured_data)

    except urllib3.exceptions.HTTPError as e:
        return f"Error: {e}"
    except Exception as e:
        return f"Error processing web page: {e}"

# --- Gradio Interface ---

iface = gr.Interface(
    fn=download_and_process_web_page,
    inputs=[
        gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
    ],
    outputs=[
        gr.Markdown(label="Web Page Content"),
    ],
    title="Enhanced Web Page Processor for Hugging Face Chat Tools",
    description="Enter the URL of a web page. The tool will extract and format its content, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools. \n [https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
    concurrency_limit=None,
    api_name="main"
)

iface.launch()
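
# Example usage (a minimal sketch, not part of the app itself): once the app is
# running, it can be queried programmatically with gradio_client. The URL below
# assumes the default local address; substitute the deployed Space URL as needed.
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("example.com", api_name="/main"))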