KSh100 committed on
Commit
a5ba9ea
·
verified ·
1 Parent(s): cf919e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -11
app.py CHANGED
@@ -26,12 +26,12 @@ def get(url):
26
  return session.get(url)
27
 
28
  # Utility Functions
29
- def extract_texts(soup, page_title):
30
- """Extracts all text content from the soup, excluding the page title."""
31
  texts = [text for text in soup.stripped_strings]
32
- # Remove the page title from the texts if it exists
33
- if page_title in texts:
34
- texts.remove(page_title)
35
  return texts
36
 
37
  def extract_links(soup, base_url):
@@ -54,9 +54,9 @@ def extract_images(soup, base_url):
54
  images.append({"Alt Text": alt_text, "Image URL": full_img_url})
55
  return images
56
 
57
- def format_detailed_output(structured_data, page_title):
58
  """Formats the structured data into a Markdown string."""
59
- result = f"## Page Title: {page_title}\n\n"
60
  result += "### Texts\n\n"
61
  result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
62
  result += "\n\n### Links\n\n"
@@ -81,15 +81,15 @@ def download_and_process_web_page(url):
81
  response = get(url)
82
  soup = response.soup()
83
 
84
- # Extract page title
85
- page_title = soup.title.string if soup.title else "No Title Found"
86
 
87
  structured_data = {
88
- "Texts": extract_texts(soup, page_title),
89
  "Links": extract_links(soup, url),
90
  "Images": extract_images(soup, url)
91
  }
92
- return format_detailed_output(structured_data, page_title)
93
 
94
  except urllib3.exceptions.HTTPError as e:
95
  return f"Error: {e}"
 
26
  return session.get(url)
27
 
28
  # Utility Functions
29
+ def extract_texts(soup, title):
30
+ """Extracts all text content from the soup, excluding the title."""
31
  texts = [text for text in soup.stripped_strings]
32
+ # Remove the title from the texts if it exists
33
+ if title in texts:
34
+ texts.remove(title)
35
  return texts
36
 
37
  def extract_links(soup, base_url):
 
54
  images.append({"Alt Text": alt_text, "Image URL": full_img_url})
55
  return images
56
 
57
+ def format_detailed_output(structured_data, title):
58
  """Formats the structured data into a Markdown string."""
59
+ result = f"## Title\n\n{title}\n\n"
60
  result += "### Texts\n\n"
61
  result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
62
  result += "\n\n### Links\n\n"
 
81
  response = get(url)
82
  soup = response.soup()
83
 
84
+ # Extract title
85
+ title = soup.title.string if soup.title else "No Title Found"
86
 
87
  structured_data = {
88
+ "Texts": extract_texts(soup, title),
89
  "Links": extract_links(soup, url),
90
  "Images": extract_images(soup, url)
91
  }
92
+ return format_detailed_output(structured_data, title)
93
 
94
  except urllib3.exceptions.HTTPError as e:
95
  return f"Error: {e}"