Spaces:

KSh100
/

websearch

Sleeping

KSh100 committed 23 days ago

Commit

a5ba9ea

verified ·

1 Parent(s): cf919e4

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -26,12 +26,12 @@ def get(url):
     return session.get(url)
 # Utility Functions
-def extract_texts(soup, page_title):
-    """Extracts all text content from the soup, excluding the page title."""
     texts = [text for text in soup.stripped_strings]
-    # Remove the page title from the texts if it exists
-    if page_title in texts:
-        texts.remove(page_title)
     return texts
 def extract_links(soup, base_url):
@@ -54,9 +54,9 @@ def extract_images(soup, base_url):
         images.append({"Alt Text": alt_text, "Image URL": full_img_url})
     return images
-def format_detailed_output(structured_data, page_title):
     """Formats the structured data into a Markdown string."""
-    result = f"## Page Title: {page_title}\n\n"
     result += "### Texts\n\n"
     result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
     result += "\n\n### Links\n\n"
@@ -81,15 +81,15 @@ def download_and_process_web_page(url):
         response = get(url)
         soup = response.soup()
-        # Extract page title
-        page_title = soup.title.string if soup.title else "No Title Found"
         structured_data = {
-            "Texts": extract_texts(soup, page_title),
             "Links": extract_links(soup, url),
             "Images": extract_images(soup, url)
         }
-        return format_detailed_output(structured_data, page_title)
     except urllib3.exceptions.HTTPError as e:
         return f"Error: {e}"

     return session.get(url)
 # Utility Functions
+def extract_texts(soup, title):
+    """Extracts all text content from the soup, excluding the title."""
     texts = [text for text in soup.stripped_strings]
+    # Remove the title from the texts if it exists
+    if title in texts:
+        texts.remove(title)
     return texts
 def extract_links(soup, base_url):
         images.append({"Alt Text": alt_text, "Image URL": full_img_url})
     return images
+def format_detailed_output(structured_data, title):
     """Formats the structured data into a Markdown string."""
+    result = f"## Title\n\n{title}\n\n"
     result += "### Texts\n\n"
     result += " ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found."
     result += "\n\n### Links\n\n"
         response = get(url)
         soup = response.soup()
+        # Extract title
+        title = soup.title.string if soup.title else "No Title Found"
         structured_data = {
+            "Texts": extract_texts(soup, title),
             "Links": extract_links(soup, url),
             "Images": extract_images(soup, url)
         }
+        return format_detailed_output(structured_data, title)
     except urllib3.exceptions.HTTPError as e:
         return f"Error: {e}"