GemmaGuard / src /extractor.py
Jay Prajapati
v1.0.0
8f885c1
raw
history blame contribute delete
802 Bytes
import PyPDF2
import requests
from bs4 import BeautifulSoup
def get_text_from_website(url: str, timeout: float = 10.0) -> str:
    """Fetch a web page and return the text of its main content elements.

    The text of every ``<p>``, ``<h1>``, ``<h2>``, ``<h3>`` and ``<li>``
    element is extracted (whitespace-stripped) and joined with single spaces.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely on a stalled server.

    Returns:
        The extracted text when the response status is 200; otherwise a
        message string of the form
        ``"Failed to retrieve the webpage. Status code: <code>"``.

    Note:
        Exceptions raised by ``requests.get`` are not caught here and
        propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)

    # Guard clause: anything other than 200 is reported as a message string,
    # which is the contract existing callers rely on.
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    content_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'li'])
    return ' '.join(
        element.get_text(separator=' ', strip=True)
        for element in content_elements
    )
def get_text_from_pdf(file_path: str) -> str:
    """Return the concatenated text of every page in a PDF file.

    Args:
        file_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        The text extracted from each page, concatenated in page order with
        no separator between pages.
    """
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract inside the ``with`` block so the file handle is still open
        # while the reader is in use. Iterate pages directly and join once
        # rather than indexing and growing a string with ``+=``.
        return ''.join(page.extract_text() for page in pdf_reader.pages)