Makima57 committed on
Commit e6312eb · verified · 1 Parent(s): e67d52b

Update app.py

Files changed (1)
  1. app.py +11 -11
app.py CHANGED
@@ -2,16 +2,14 @@ import streamlit as st
 from googlesearch import search
 import requests
 from bs4 import BeautifulSoup
+import chunk  # Import the chunking functionality from app2.py
 
 # Function to perform Google search and return the first two links
 def google_search(query):
     try:
         query = query + "/t site:https://medium.com/"
-        # Perform the search and get an iterator of results
         search_results = search(query, num_results=10)  # Get up to 10 results
         first_two_links = []
-
-        # Get the first two results
         for i, link in enumerate(search_results):
             if i < 2:
                 first_two_links.append(link)
@@ -32,16 +30,15 @@ def fetch_webpage_content(url):
         st.error(f"Failed to fetch the webpage content: {e}")
         return None
 
-# Function to scrape text from webpage content using Beautiful Soup
+# Function to scrape text from webpage content using BeautifulSoup
 def scrape_text(webpage_content):
     try:
         soup = BeautifulSoup(webpage_content, 'html.parser')
         # Remove all script and style elements
         for script in soup(["script", "style"]):
             script.decompose()
-        # Get the text from the BeautifulSoup object
         text = soup.get_text()
-        # Break the text into lines and remove leading and trailing space on each
+        # Break the text into lines and remove leading/trailing spaces
         lines = (line.strip() for line in text.splitlines())
         # Break multi-headlines into a line each
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
@@ -65,18 +62,21 @@ if st.button("Search"):
     if first_two_links:
         for i, link in enumerate(first_two_links):
             st.success(f"Link {i+1}: [Click here]({link})")
-
+
             # Fetch webpage content
             webpage_content = fetch_webpage_content(link)
             if webpage_content:
                 # Scrape text from webpage content
                 scraped_text = scrape_text(webpage_content)
                 if scraped_text:
-                    st.write(f"Scraped Content from Link {i+1}:")
-                    st.write(scraped_text)
-                    # Download button for the webpage content
+                    st.write(f"Scraped Content from Link {i+1} (Chunked):")
+
+                    # Call the chunking function from app2.py
+                    chunk.display_chunks(scraped_text)
+
+                    # Option to download the entire scraped content
                     st.download_button(
-                        label=f"Download Webpage Content from Link {i+1}",
+                        label=f"Download Full Webpage Content from Link {i+1}",
                         data=scraped_text,
                         file_name=f"webpage_content_{i+1}.txt",
                         mime="text/plain"