KSh100 committed on
Commit
cf919e4
·
verified ·
1 Parent(s): 65c6434

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -4
app.py CHANGED
@@ -26,9 +26,13 @@ def get(url):
26
  return session.get(url)
27
 
28
  # Utility Functions
29
- def extract_texts(soup):
30
- """Extracts all text content from the soup."""
31
- return [text for text in soup.stripped_strings]
 
 
 
 
32
 
33
  def extract_links(soup, base_url):
34
  """Extracts all valid links from the soup."""
@@ -81,7 +85,7 @@ def download_and_process_web_page(url):
81
  page_title = soup.title.string if soup.title else "No Title Found"
82
 
83
  structured_data = {
84
- "Texts": extract_texts(soup),
85
  "Links": extract_links(soup, url),
86
  "Images": extract_images(soup, url)
87
  }
 
26
  return session.get(url)
27
 
28
  # Utility Functions
29
def extract_texts(soup, page_title):
    """Extract all text content from the soup, excluding the page title.

    Args:
        soup: Parsed document exposing a ``stripped_strings`` iterable
            (e.g. a BeautifulSoup object).
        page_title: Title text to drop from the result. May be ``None`` or
            carry surrounding whitespace (as ``soup.title.string`` can).

    Returns:
        A list of the text fragments in document order, with the first
        fragment equal to the whitespace-stripped title removed. If the
        title is not present, all fragments are returned unchanged.
    """
    texts = list(soup.stripped_strings)

    # The fragments are already whitespace-stripped, so the title has to be
    # normalized the same way; otherwise a title such as "\n  Home \n" would
    # never match and would stay in the output.
    title = page_title.strip() if isinstance(page_title, str) else page_title

    # Only the first occurrence is dropped, so body text that happens to
    # repeat the title later in the page is preserved.
    if title in texts:
        texts.remove(title)
    return texts
36
 
37
  def extract_links(soup, base_url):
38
  """Extracts all valid links from the soup."""
 
85
  page_title = soup.title.string if soup.title else "No Title Found"
86
 
87
  structured_data = {
88
+ "Texts": extract_texts(soup, page_title),
89
  "Links": extract_links(soup, url),
90
  "Images": extract_images(soup, url)
91
  }