KSh100 committed on
Commit 900966d · verified · 1 Parent(s): 07d7acc

Create app.py

Files changed (1)
  1. app.py +125 -0
app.py ADDED
@@ -0,0 +1,125 @@
+ import re
+ import urllib3
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin
+ import gradio as gr
+
+ # Constants
+ CHUNK_SIZE = 32000
+
+ # --- Custom HTTP Session and Response Classes ---
+
+ class CustomSession:
+     """Thin wrapper around a urllib3 PoolManager that returns CustomResponse objects."""
+     def __init__(self):
+         self.pool_manager = urllib3.PoolManager()
+
+     def get(self, url):
+         response = self.pool_manager.request('GET', url)
+         return CustomResponse(response)
+
+ class CustomResponse:
+     """Adapts a urllib3 response to a small requests-like interface."""
+     def __init__(self, response):
+         self.status_code = response.status
+         self.headers = response.headers
+         self.content = response.data
+
+     def soup(self):
+         return BeautifulSoup(self.content, 'lxml')
+
+     def clean_text(self):
+         soup = self.soup()
+         # Flatten the page text and collapse repeated whitespace into single spaces
+         cleaned_text = soup.get_text().replace('\n', ' ').replace('\r', ' ').replace('  ', ' ')
+         while '  ' in cleaned_text:
+             cleaned_text = cleaned_text.replace('  ', ' ')
+         return cleaned_text.strip()
+
+ def get(url):
+     """Module-level convenience wrapper: fetch a URL with a fresh CustomSession."""
+     session = CustomSession()
+     return session.get(url)
+
+ # --- Utility Functions ---
+
+ def extract_texts(soup):
+     """Extracts all text content from the soup."""
+     return [text for text in soup.stripped_strings]
+
+ def extract_links(soup, base_url):
+     """Extracts all valid links from the soup."""
+     links = []
+     for link in soup.find_all('a', href=True):
+         href = link['href']
+         # Resolve relative links against the page URL; leave absolute links untouched
+         full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
+         link_text = link.get_text(strip=True) or "No Text"
+         links.append({"Text": link_text, "URL": full_url})
+     return links
+
+ def extract_images(soup, base_url):
+     """Extracts all valid image URLs and their alt text from the soup."""
+     images = []
+     for img in soup.find_all('img', src=True):
+         img_url = img['src']
+         full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
+         alt_text = img.get('alt', 'No Alt Text')
+         images.append({"Alt Text": alt_text, "Image URL": full_img_url})
+     return images
+
+ def format_detailed_output(structured_data):
+     """Formats the structured data into a Markdown string."""
+     result = "### Structured Page Content\n\n"
+     result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
+     result += "**Links:**\n"
+     if structured_data["Links"]:
+         result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
+     else:
+         result += "No links found.\n"
+     result += "**Images:**\n"
+     if structured_data["Images"]:
+         result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
+     else:
+         result += "No images found.\n"
+     return result
+
+ # --- Web Page Processing Functions ---
+
+ def download_and_process_web_page(url, clean=True):
+     """Downloads a web page from a URL and processes its content."""
+     if not url.startswith("http://") and not url.startswith("https://"):
+         url = "http://" + url  # Prepend "http://" if no scheme is present
+
+     try:
+         response = get(url)
+         if response.status_code != 200:
+             return f"Error: Received status code {response.status_code}", 0
+
+         soup = response.soup()
+         structured_data = {
+             # When "Clean Text" is checked, return the whitespace-collapsed page text;
+             # otherwise return the individual text fragments.
+             "Texts": [response.clean_text()] if clean else extract_texts(soup),
+             "Links": extract_links(soup, url),
+             "Images": extract_images(soup, url)
+         }
+         formatted = format_detailed_output(structured_data)
+         return formatted, len(formatted)
+
+     except urllib3.exceptions.HTTPError as e:
+         return f"Error: {e}", 0
+     except Exception as e:
+         return f"Error processing web page: {e}", 0
+
+ # --- Gradio Interface ---
+
+ iface = gr.Interface(
+     fn=download_and_process_web_page,
+     inputs=[
+         gr.Textbox(lines=1, placeholder="Enter URL of the web page"),
+         gr.Checkbox(label="Clean Text", value=True),
+     ],
+     outputs=[
+         gr.Markdown(label="Web Page Content"),
+         gr.Number(label="Content Length (characters)"),
+     ],
+     title="Enhanced Web Page Processor for Hugging Face Chat Tools",
+     description="Enter the URL of a web page. The tool will extract and format its content, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools. \n [https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
+     concurrency_limit=None,
+     api_name="main"
+ )
+
+ iface.launch()
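
For reference, a minimal sketch of how a client could call this app once it is deployed as a Gradio Space. The Space id below is a hypothetical placeholder; the `/main` endpoint name follows from the `api_name="main"` argument in the `gr.Interface` above, and the two return values correspond to the Markdown content and the character count outputs.

```python
# Minimal client-side sketch using gradio_client.
# "KSh100/web-page-processor" is a hypothetical Space id, not the real deployment.
from gradio_client import Client

client = Client("KSh100/web-page-processor")
content, length = client.predict(
    "https://example.com",  # URL of the web page
    True,                   # "Clean Text" checkbox
    api_name="/main",
)
print(length, "characters")
print(content[:500])
```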