Spaces:

KSh100
/

websearch

Sleeping

App Files Files Community

websearch / app.py

KSh100

Update app.py

bf6070d verified about 1 month ago

raw

history blame

4.33 kB

	import re
	import urllib3
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin
	import gradio as gr

	# Constants
	# NOTE(review): CHUNK_SIZE is not referenced anywhere else in the visible
	# part of this file — presumably a leftover; confirm before removing.
	CHUNK_SIZE = 32000

	# --- Custom HTTP Session and Response Classes ---

	class CustomSession:
	    """Minimal HTTP client built on a urllib3 ``PoolManager``.

	    Args:
	        timeout: Seconds allowed for establishing a connection and for each
	            read from the socket. Pass ``None`` to disable the limit (the
	            previous behaviour). Defaults to 10.0.
	    """

	    def __init__(self, timeout=10.0):
	        # The pool was previously created without any timeout, so a host
	        # that accepts the connection but never answers could block the
	        # calling request handler for as long as the socket stays open.
	        self.pool_manager = urllib3.PoolManager(
	            timeout=urllib3.Timeout(connect=timeout, read=timeout)
	        )

	    def get(self, url):
	        """Issue a GET request for *url* and wrap the result.

	        Returns:
	            A ``CustomResponse`` built from the urllib3 response.
	        """
	        response = self.pool_manager.request('GET', url)
	        return CustomResponse(response)

	class CustomResponse:
	    """Thin wrapper around an HTTP response that adds HTML helpers.

	    The status, headers and body are copied from the wrapped response when
	    the instance is created.
	    """

	    def __init__(self, response):
	        """Copy ``status``, ``headers`` and ``data`` from *response*.

	        Args:
	            response: Object exposing ``status``, ``headers`` and ``data``
	                attributes (the value returned by ``PoolManager.request``
	                in ``CustomSession.get``).
	        """
	        self.status_code = response.status
	        self.headers = response.headers
	        self.content = response.data

	    def soup(self):
	        """Parse the body with BeautifulSoup using the ``lxml`` parser.

	        A new parse is performed on every call.
	        """
	        return BeautifulSoup(self.content, 'lxml')

	    @staticmethod
	    def _collapse_whitespace(text):
	        """Return *text* with every whitespace run replaced by one space.

	        ``str.split()`` with no argument splits on any whitespace run and
	        drops leading/trailing whitespace, so joining with a single space
	        normalises newlines, carriage returns and repeated spaces in one
	        pass.
	        """
	        return " ".join(text.split())

	    def clean_text(self):
	        """Return the page's visible text on a single line.

	        The previous implementation looped while a single space was present
	        and replaced a space with a space, which never terminates for any
	        text containing a space; whitespace is now collapsed with
	        split/join instead.
	        """
	        return self._collapse_whitespace(self.soup().get_text())

	def get(url):
	    """Fetch *url* with a freshly created ``CustomSession``.

	    Returns whatever ``CustomSession.get`` returns for that URL.
	    """
	    return CustomSession().get(url)

	# --- Utility Functions ---

	def extract_texts(soup):
	    """Return the soup's ``stripped_strings`` as a list."""
	    return list(soup.stripped_strings)

	def extract_links(soup, base_url):
	    """Collect every ``<a>`` element that has an ``href`` attribute.

	    Args:
	        soup: Parsed document supporting ``find_all``.
	        base_url: URL that non-absolute hrefs are resolved against.

	    Returns:
	        A list of ``{"Text": ..., "URL": ...}`` dicts in document order.
	        Anchors without visible text get the label "No Text".
	    """
	    absolute_prefixes = ("http://", "https://")
	    collected = []
	    for anchor in soup.find_all('a', href=True):
	        target = anchor['href']
	        # Only hrefs lacking an explicit http(s) scheme are joined with
	        # the base; absolute ones are passed through untouched.
	        if not target.startswith(absolute_prefixes):
	            target = urljoin(base_url, target)
	        label = anchor.get_text(strip=True) or "No Text"
	        collected.append({"Text": label, "URL": target})
	    return collected

	def extract_images(soup, base_url):
	    """Collect every ``<img>`` element that has a ``src`` attribute.

	    Args:
	        soup: Parsed document supporting ``find_all``.
	        base_url: URL that non-absolute ``src`` values are resolved against.

	    Returns:
	        A list of ``{"Alt Text": ..., "Image URL": ...}`` dicts in document
	        order. "No Alt Text" is used when the ``alt`` attribute is absent.
	    """
	    absolute_prefixes = ("http://", "https://")
	    collected = []
	    for tag in soup.find_all('img', src=True):
	        source = tag['src']
	        # Sources that already start with http(s) are kept verbatim.
	        if not source.startswith(absolute_prefixes):
	            source = urljoin(base_url, source)
	        collected.append({
	            "Alt Text": tag.get('alt', 'No Alt Text'),
	            "Image URL": source,
	        })
	    return collected

	def format_detailed_output(structured_data):
	    """Render extracted page data as a Markdown string.

	    Args:
	        structured_data: Mapping with the keys "Texts" (list of strings),
	            "Links" (list of dicts with "Text" and "URL") and "Images"
	            (list of dicts with "Alt Text" and "Image URL").

	    Returns:
	        A string made of a heading followed by Texts, Links and Images
	        sections; an empty section is replaced by a placeholder sentence.
	    """
	    texts = structured_data["Texts"]
	    links = structured_data["Links"]
	    images = structured_data["Images"]

	    pieces = ["### Structured Page Content\n\n", "Texts:\n"]
	    pieces.append(" ".join(texts) if texts else "No textual content found.")
	    pieces.append("\n\n")

	    pieces.append("Links:\n")
	    if links:
	        link_lines = [f"[{entry['Text']}]({entry['URL']})" for entry in links]
	        pieces.append("\n".join(link_lines) + "\n")
	    else:
	        pieces.append("No links found.\n")

	    pieces.append("Images:\n")
	    if images:
	        image_lines = [
	            f"![{entry['Alt Text']}]({entry['Image URL']})" for entry in images
	        ]
	        pieces.append("\n".join(image_lines) + "\n")
	    else:
	        pieces.append("No images found.\n")

	    return "".join(pieces)

	# --- Web Page Processing Functions ---

	def download_and_process_web_page(url):
	    """Download a web page and return its content formatted as Markdown.

	    Args:
	        url: Address of the page. Surrounding whitespace is ignored and
	            "http://" is prepended when no http(s) scheme is present.

	    Returns:
	        The Markdown produced by ``format_detailed_output`` on success, or
	        a string starting with "Error" describing what went wrong. This
	        function reports failures through its return value instead of
	        raising.
	    """
	    # Pasted URLs often carry stray whitespace or a trailing newline, and
	    # the UI may hand over an empty value; reject those before any request.
	    url = (url or "").strip()
	    if not url:
	        return "Error: No URL provided."

	    # Schemes are case-insensitive: without lower() an input such as
	    # "HTTP://example.com" would be turned into "http://HTTP://example.com".
	    if not url.lower().startswith(("http://", "https://")):
	        url = "http://" + url  # Prepend "http://" if not present

	    # NOTE(review): the URL is user-supplied and fetched from this process;
	    # nothing here restricts requests to internal or private hosts.
	    try:
	        response = get(url)
	        if response.status_code != 200:
	            return f"Error: Received status code {response.status_code}"

	        soup = response.soup()
	        structured_data = {
	            "Texts": extract_texts(soup),
	            "Links": extract_links(soup, url),
	            "Images": extract_images(soup, url),
	        }
	        return format_detailed_output(structured_data)

	    except urllib3.exceptions.HTTPError as e:
	        return f"Error: {e}"
	    except Exception as e:
	        # Top-level boundary for the UI callback: report instead of crashing.
	        return f"Error processing web page: {e}"

	# --- Gradio Interface ---

	# Text shown under the title; the trailing Markdown link points at the
	# Hugging Face Chat tool this app is meant to back.
	_INTERFACE_DESCRIPTION = (
	    "Enter the URL of a web page. The tool will extract and format its content, including text, links, and images. This tool is designed for use with Hugging Face Chat Tools. \n "
	    "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)"
	)

	# Single text input, single Markdown output, exposed under the "main" API
	# name with no concurrency limit.
	iface = gr.Interface(
	    fn=download_and_process_web_page,
	    inputs=[gr.Textbox(lines=1, placeholder="Enter URL of the web page")],
	    outputs=[gr.Markdown(label="Web Page Content")],
	    title="Enhanced Web Page Processor for Hugging Face Chat Tools",
	    description=_INTERFACE_DESCRIPTION,
	    concurrency_limit=None,
	    api_name="main",
	)

	iface.launch()