import streamlit as st
from googlesearch import search
import requests
from bs4 import BeautifulSoup
import chunk # Import the chunking function from chunk.py
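# Note: chunk.py is a local helper module; based on its usage below it is
# assumed to expose chunk_text(text: str) -> list[str], splitting long text
# into smaller pieces. The exact signature is an assumption, not confirmed
# by this file. (The local module also shadows the deprecated standard-library
# "chunk" module when it sits next to this script.)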
# Function to perform a Google search and return the first two result links
def google_search(query):
    try:
        search_results = search(query, num_results=2)  # Generator of result URLs
        # Materialize up to two result URLs, dropping any empty entries
        first_two_links = [link for link in search_results if link][:2]
        return first_two_links
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None
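# Note: the googlesearch library scrapes Google's result pages, so repeated
# queries may be rate-limited or blocked, and results can vary by region.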
# Function to fetch webpage content
def fetch_webpage_content(url):
    try:
        response = requests.get(url, timeout=10)  # Time out rather than hang on slow sites
        response.raise_for_status()  # Raise if the request was unsuccessful
        return response.text
    except Exception as e:
        st.error(f"Failed to fetch the webpage content: {e}")
        return None
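# Note: some sites reject requests that lack a browser-like User-Agent.
# If fetches fail with 403 errors, passing a headers dict to requests.get,
# e.g. headers={"User-Agent": "Mozilla/5.0"}, can help.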
# Function to scrape visible text from webpage content using Beautiful Soup
def scrape_text(webpage_content):
    try:
        soup = BeautifulSoup(webpage_content, 'html.parser')
        # Remove script and style elements so only visible text remains
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text()
        # Collapse whitespace: strip each line, split on double-space runs,
        # and drop empty fragments. Named "phrases" to avoid confusion with
        # the imported chunk module.
        lines = (line.strip() for line in text.splitlines())
        phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(phrase for phrase in phrases if phrase)
        return text
    except Exception as e:
        st.error(f"Failed to scrape text from webpage content: {e}")
        return None
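# Example (hypothetical input): scrape_text("<html><body><p>Hello</p></body></html>")
# returns "Hello" once the markup and surrounding whitespace are stripped.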
# Streamlit app UI
st.title("Search and Chunk Webpage Content")
# Input field for search query
query = st.text_input("Enter search query", "")
# Button to trigger search
if st.button("Search"):
    if query:
        first_two_links = google_search(query)
        if first_two_links:
            for i, link in enumerate(first_two_links, 1):
                st.success(f"Link {i}: [Click here]({link})")

                # Fetch webpage content
                webpage_content = fetch_webpage_content(link)
                if webpage_content:
                    # Scrape text from webpage content
                    scraped_text = scrape_text(webpage_content)
                    if scraped_text:
                        # Chunk the scraped text using chunk.py
                        chunked_text = chunk.chunk_text(scraped_text)

                        # Save chunked data to a per-link .txt file so the
                        # second link does not overwrite the first
                        with open(f"chunked_data_link_{i}.txt", "w", encoding="utf-8") as f:
                            f.write("\n---\n".join(chunked_text))  # Separate chunks with a delimiter

                        st.write(f"Chunked Data for Link {i}:")
                        for chunk_part in chunked_text:
                            st.write(chunk_part)

                        # Provide a download button for the chunked text;
                        # a unique key avoids duplicate-widget errors when
                        # the button is created more than once in the loop
                        st.download_button(
                            label=f"Download Chunked Content (Link {i})",
                            data="\n---\n".join(chunked_text),
                            file_name=f"chunked_webpage_content_{i}.txt",
                            mime="text/plain",
                            key=f"download_{i}",
                        )
        else:
            st.warning("No results found")
    else:
        st.error("Please enter a query")
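# To run the app locally (assuming this file is saved as app.py, a
# hypothetical name): streamlit run app.py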