Spaces:

KSh100
/

websearch

Sleeping

KSh100 committed 23 days ago

Commit

cf919e4

verified ·

1 Parent(s): 65c6434

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -26,9 +26,13 @@ def get(url):
     return session.get(url)
 # Utility Functions
-def extract_texts(soup):
-    """Extracts all text content from the soup."""
-    return [text for text in soup.stripped_strings]
 def extract_links(soup, base_url):
     """Extracts all valid links from the soup."""
@@ -81,7 +85,7 @@ def download_and_process_web_page(url):
         page_title = soup.title.string if soup.title else "No Title Found"
         structured_data = {
-            "Texts": extract_texts(soup),
             "Links": extract_links(soup, url),
             "Images": extract_images(soup, url)
         }

     return session.get(url)
 # Utility Functions
+def extract_texts(soup, page_title):
+    """Extracts all text content from the soup, excluding the page title."""
+    texts = [text for text in soup.stripped_strings]
+    # Remove the page title from the texts if it exists
+    if page_title in texts:
+        texts.remove(page_title)
+    return texts
 def extract_links(soup, base_url):
     """Extracts all valid links from the soup."""
         page_title = soup.title.string if soup.title else "No Title Found"
         structured_data = {
+            "Texts": extract_texts(soup, page_title),
             "Links": extract_links(soup, url),
             "Images": extract_images(soup, url)
         }