# hevold's picture
# Upload 29 files
# b34efa5 verified
"""
Text extraction module for Norwegian RAG chatbot.
Extracts text from various document formats.
"""
import os
import PyPDF2
from typing import List, Optional
from bs4 import BeautifulSoup
class TextExtractor:
    """
    Extracts plain text from various document formats.

    Currently supports:
    - PDF (.pdf)
    - Text files (.txt)
    - HTML (.html, .htm)
    - Web pages fetched by URL (see ``extract_from_url``)

    All ``extract_from_*`` methods for a specific format are best-effort:
    on failure they print an error message and return an empty string
    instead of raising. Only ``extract_from_file`` raises, and only for a
    missing path or an unsupported extension.
    """

    @staticmethod
    def extract_from_file(file_path: str) -> str:
        """
        Extract text from a file, choosing the extractor by file extension.

        The extension comparison is case-insensitive.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text content ("" if the format-specific extractor failed)

        Raises:
            FileNotFoundError: If ``file_path`` does not exist
            ValueError: If the file extension is not supported
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.pdf':
            return TextExtractor.extract_from_pdf(file_path)
        if file_extension == '.txt':
            return TextExtractor.extract_from_text(file_path)
        if file_extension in ('.html', '.htm'):
            return TextExtractor.extract_from_html(file_path)
        raise ValueError(f"Unsupported file format: {file_extension}")

    @staticmethod
    def extract_from_pdf(file_path: str) -> str:
        """
        Extract text from a PDF file.

        The text of each page is followed by a blank line ("\\n\\n").

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content, or "" if the PDF could not be read
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # `or ""` guards against pages for which extract_text()
                # yields None (e.g. image-only pages in some PyPDF2
                # versions); otherwise one such page would raise TypeError
                # and discard the whole document.
                page_texts = [
                    (page.extract_text() or "") + "\n\n"
                    for page in pdf_reader.pages
                ]
        except Exception as e:
            print(f"Error extracting text from PDF {file_path}: {str(e)}")
            return ""
        return "".join(page_texts)

    @staticmethod
    def extract_from_text(file_path: str) -> str:
        """
        Extract text from a plain text file.

        The file is decoded as UTF-8 (a leading byte-order mark is dropped);
        if that fails it is decoded as latin-1.

        Args:
            file_path: Path to the text file

        Returns:
            Extracted text content, or "" if the file could not be read
        """
        try:
            return TextExtractor._read_text_file(file_path)
        except Exception as e:
            print(f"Error extracting text from file {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_html(file_path: str) -> str:
        """
        Extract the visible text from an HTML file.

        Uses the same decoding rules as ``extract_from_text`` (UTF-8 first,
        then latin-1), drops <script> and <style> content, and returns one
        non-empty text fragment per line.

        Args:
            file_path: Path to the HTML file

        Returns:
            Extracted text content, or "" if the file could not be processed
        """
        try:
            html_content = TextExtractor._read_text_file(file_path)
            return TextExtractor._html_to_text(html_content)
        except Exception as e:
            print(f"Error extracting text from HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_url(url: str, timeout: float = 30.0) -> str:
        """
        Download a web page and extract its visible text.

        Args:
            url: Web URL to extract text from
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests.get`` can block indefinitely.

        Returns:
            Extracted text content, or "" if the request failed, timed out,
            returned an HTTP error status, or could not be parsed
        """
        try:
            # Imported lazily so the module can be used for local files
            # even when `requests` is not installed.
            import requests

            response = requests.get(url, timeout=timeout)
            # Treat 4xx/5xx responses as failures rather than extracting
            # the text of an error page.
            response.raise_for_status()
            # Pass raw bytes so the parser can detect the page encoding.
            return TextExtractor._html_to_text(response.content)
        except Exception as e:
            print(f"Error extracting text from URL {url}: {str(e)}")
            return ""

    @staticmethod
    def _read_text_file(file_path: str) -> str:
        """Read a file as UTF-8 (BOM-tolerant), falling back to latin-1."""
        try:
            # 'utf-8-sig' decodes plain UTF-8 too, but strips a leading BOM.
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                return file.read()
        except UnicodeDecodeError:
            # Try with different encoding if UTF-8 fails
            with open(file_path, 'r', encoding='latin-1') as file:
                return file.read()

    @staticmethod
    def _html_to_text(markup) -> str:
        """Parse HTML (str or bytes) and return its cleaned visible text."""
        soup = BeautifulSoup(markup, 'html.parser')
        # Remove script and style elements so their source is not
        # returned as if it were page text.
        for element in soup(["script", "style"]):
            element.extract()
        return TextExtractor._clean_text(soup.get_text())

    @staticmethod
    def _clean_text(raw_text: str) -> str:
        """Strip each line, split multi-headlines and drop blank fragments."""
        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in raw_text.splitlines())
        # Break multi-headlines into a line each. The separator is a run of
        # TWO spaces: splitting on a single space would put every word on
        # its own line.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines
        return '\n'.join(chunk for chunk in chunks if chunk)