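"""Search Link Finder.

A Streamlit app that Google-searches medium.com for a query, fetches the
first two result pages, scrapes their visible text, and displays it in
chunks with a download option for each page.
"""
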
import streamlit as st
from googlesearch import search
import requests
from bs4 import BeautifulSoup
import chunk  # Local module providing the chunking functionality (display_chunks)
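
# NOTE (assumption): `chunk` above is a local helper (e.g. a chunk.py next to
# this file) exposing display_chunks(text); Python's stdlib also ships a
# `chunk` module, so the local file must take precedence on sys.path. A
# minimal sketch of such a helper, splitting text into fixed-size pieces and
# rendering each one, might look like:
#
#     import streamlit as st
#
#     def display_chunks(text, chunk_size=500):
#         """Render `text` in chunk_size-character pieces."""
#         for start in range(0, len(text), chunk_size):
#             st.write(text[start:start + chunk_size])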

# Function to perform a Google search and return the first two links
def google_search(query):
    try:
        query = query + " site:medium.com"  # Restrict results to medium.com
        search_results = search(query, num_results=10)  # Fetch up to 10 results
        first_two_links = []
        for i, link in enumerate(search_results):
            if i < 2:
                first_two_links.append(link)
            else:
                break
        return first_two_links
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None
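
# Design note: the loop above could also be written as
# list(itertools.islice(search_results, 2)); the explicit loop keeps the
# early-exit behavior obvious.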

# Function to fetch webpage content
def fetch_webpage_content(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise if the request was unsuccessful
        return response.text
    except Exception as e:
        st.error(f"Failed to fetch the webpage content: {e}")
        return None

# Function to scrape visible text from webpage content using BeautifulSoup
def scrape_text(webpage_content):
    try:
        soup = BeautifulSoup(webpage_content, 'html.parser')
        # Remove all script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text()
        # Break the text into lines and remove leading/trailing spaces
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines (separated by runs of spaces) into a line each
        phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines
        text = '\n'.join(phrase for phrase in phrases if phrase)
        return text
    except Exception as e:
        st.error(f"Failed to scrape text from webpage content: {e}")
        return None

# Streamlit app UI
st.title("Search Link Finder")

# Input field for search query
query = st.text_input("Enter search query", "")

# Button to trigger search
if st.button("Search"):
    if query:
        first_two_links = google_search(query)
        if first_two_links:
            for i, link in enumerate(first_two_links):
                st.success(f"Link {i+1}: [Click here]({link})")
                # Fetch webpage content
                webpage_content = fetch_webpage_content(link)
                if webpage_content:
                    # Scrape text from webpage content
                    scraped_text = scrape_text(webpage_content)
                    if scraped_text:
                        st.write(f"Scraped Content from Link {i+1} (Chunked):")
                        # Call the chunking function from the local chunk module
                        chunk.display_chunks(scraped_text)
                        # Option to download the entire scraped content
                        st.download_button(
                            label=f"Download Full Webpage Content from Link {i+1}",
                            data=scraped_text,
                            file_name=f"webpage_content_{i+1}.txt",
                            mime="text/plain"
                        )
        else:
            st.warning("No results found")
    else:
        st.error("Please enter a query")
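
# Run with (assuming this file is saved as app.py):
#     streamlit run app.py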