"""
Text extraction module for Norwegian RAG chatbot.

Extracts text from various document formats.
"""

import os
from typing import List, Optional

import PyPDF2
from bs4 import BeautifulSoup


class TextExtractor:
    """
    Extracts text from various document formats.

    Currently supports:
    - PDF (.pdf)
    - Text files (.txt)
    - HTML (.html, .htm)
    """

    @staticmethod
    def extract_from_file(file_path: str) -> str:
        """
        Extract text from a file based on its extension.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text content
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            return TextExtractor.extract_from_pdf(file_path)
        elif file_extension == '.txt':
            return TextExtractor.extract_from_text(file_path)
        elif file_extension in ['.html', '.htm']:
            return TextExtractor.extract_from_html(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    @staticmethod
    def extract_from_pdf(file_path: str) -> str:
        """
        Extract text from a PDF file.

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content
        """
        text = ""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Iterate pages directly; guard against pages that yield no text
                for page in pdf_reader.pages:
                    text += (page.extract_text() or "") + "\n\n"
        except Exception as e:
            print(f"Error extracting text from PDF {file_path}: {str(e)}")
            return ""

        return text

    @staticmethod
    def extract_from_text(file_path: str) -> str:
        """
        Extract text from a plain text file.

        Args:
            file_path: Path to the text file

        Returns:
            Extracted text content
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            # Fall back to latin-1 for files that are not valid UTF-8
            try:
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
            except Exception as e:
                print(f"Error extracting text from file {file_path}: {str(e)}")
                return ""
        except Exception as e:
            print(f"Error extracting text from file {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_html(file_path: str) -> str:
        """
        Extract text from an HTML file.

        Args:
            file_path: Path to the HTML file

        Returns:
            Extracted text content
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove script and style elements so only visible text remains
            for tag in soup(["script", "style"]):
                tag.decompose()

            text = soup.get_text()

            # Normalize whitespace: strip each line, split on runs of multiple
            # spaces, and drop empty chunks
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)

            return text
        except Exception as e:
            print(f"Error extracting text from HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_url(url: str) -> str:
        """
        Extract text from a web URL.

        Args:
            url: Web URL to extract text from

        Returns:
            Extracted text content
        """
        try:
            import requests

            # A timeout keeps the request from hanging indefinitely; raise on
            # HTTP errors so error pages are not indexed as content
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements so only visible text remains
            for tag in soup(["script", "style"]):
                tag.decompose()

            text = soup.get_text()

            # Normalize whitespace: strip each line, split on runs of multiple
            # spaces, and drop empty chunks
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)

            return text
        except Exception as e:
            print(f"Error extracting text from URL {url}: {str(e)}")
            return ""
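

# Minimal usage sketch. The file path and URL below are hypothetical examples,
# not part of the project; point them at real documents before running.
if __name__ == "__main__":
    sample_path = "data/sample.pdf"  # hypothetical path, for illustration only
    if os.path.exists(sample_path):
        # The dispatcher picks the parser from the file extension
        print(TextExtractor.extract_from_file(sample_path)[:500])

    # Fetch and clean a web page
    print(TextExtractor.extract_from_url("https://example.com")[:500])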