import PyPDF2
import requests
from bs4 import BeautifulSoup


def get_text_from_website(url: str, timeout: float = 30.0) -> str:
    """Fetch a web page and return the text of its content elements.

    The page is parsed with BeautifulSoup's ``html.parser`` and the text of
    every ``<p>``, ``<h1>``, ``<h2>``, ``<h3>`` and ``<li>`` element is joined
    with single spaces.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The extracted text when the response status is 200; otherwise a
        human-readable failure message that includes the status code.
        Callers that need to tell the two apart must inspect the string.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (for example on connection failure or timeout).
    """
    # Without a timeout a stalled server would block the caller indefinitely.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): find_all also matches nested tags, so text inside e.g. a
    # <p> within an <li> presumably shows up twice -- confirm if that matters.
    content_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'li'])
    return ' '.join(
        element.get_text(separator=' ', strip=True)
        for element in content_elements
    )


def get_text_from_pdf(file_path) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The text of all pages joined together with no separator between
        pages. An empty string if the document has no pages.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file is still open. The `or ''`
        # keeps a page that yields no text from breaking the concatenation.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)