Spaces:
Sleeping
Sleeping
import PyPDF2 | |
import requests | |
from bs4 import BeautifulSoup | |
def get_text_from_website(url, timeout=10):
    """Fetch a web page and return the text of its main content elements.

    The page is parsed with BeautifulSoup's ``html.parser`` and the text of
    every ``<p>``, ``<h1>``, ``<h2>``, ``<h3>`` and ``<li>`` element is
    joined with single spaces, in document order.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server
            would block the caller indefinitely.

    Returns:
        The extracted text when the server answers with status 200.
        For any other status code, a human-readable failure message that
        includes the code (kept as a return value, not an exception, so
        existing callers keep working).

    Raises:
        Nothing is caught here: connection errors and timeouts raised by
        ``requests.get`` propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)

    # Guard clause: anything other than a plain 200 is reported as a message.
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    content_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'li'])
    # separator/strip collapse nested inline tags into clean space-separated text.
    return ' '.join(
        element.get_text(separator=' ', strip=True)
        for element in content_elements
    )
def get_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file as one string.

    Args:
        file_path: Path of the PDF file to read; opened in binary mode.

    Returns:
        The text of all pages concatenated in page order, with no
        separator inserted between pages. An empty string if the
        document has no pages or no page yields any text.

    Raises:
        Nothing is caught here: errors from ``open`` (e.g. a missing file)
        and from PyPDF2 while parsing propagate to the caller.
    """
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file is still open, since the
        # reader pulls page data from the underlying stream.
        # `or ''` guards against a page yielding None/empty instead of text
        # (NOTE(review): depends on PyPDF2 version; harmless if always str).
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)