File size: 5,387 Bytes
b34efa5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
"""
Text extraction module for Norwegian RAG chatbot.
Extracts text from various document formats.
"""
import os
import PyPDF2
from typing import List, Optional
from bs4 import BeautifulSoup
class TextExtractor:
    """
    Extracts text from various document formats.

    Currently supports:
    - PDF (.pdf)
    - Text files (.txt)
    - HTML (.html, .htm)

    All public extractors return the extracted text, or "" on extraction
    failure (errors are reported to stdout rather than raised, so one bad
    document does not abort a batch ingest).
    """

    @staticmethod
    def extract_from_file(file_path: str) -> str:
        """
        Extract text from a file, dispatching on its extension.

        Args:
            file_path: Path to the document file.

        Returns:
            Extracted text content ("" if extraction failed).

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: If the extension is not one of .pdf/.txt/.html/.htm.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            return TextExtractor.extract_from_pdf(file_path)
        elif file_extension == '.txt':
            return TextExtractor.extract_from_text(file_path)
        elif file_extension in ('.html', '.htm'):
            return TextExtractor.extract_from_html(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    @staticmethod
    def extract_from_pdf(file_path: str) -> str:
        """
        Extract text from a PDF file.

        Args:
            file_path: Path to the PDF file.

        Returns:
            Extracted text content, pages separated by blank lines
            ("" if the PDF could not be read).
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # extract_text() can return None for pages with no text
                # layer (e.g. scanned images); coerce to "" so the page
                # separator concatenation cannot raise TypeError.
                return "".join(
                    (page.extract_text() or "") + "\n\n"
                    for page in pdf_reader.pages
                )
        except Exception as e:
            print(f"Error extracting text from PDF {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_text(file_path: str) -> str:
        """
        Extract text from a plain text file.

        Tries UTF-8 first, then falls back to Latin-1 (common for older
        Norwegian documents with æ/ø/å in legacy encodings).

        Args:
            file_path: Path to the text file.

        Returns:
            File contents ("" if the file could not be read).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a code point, so this fallback
            # always decodes; any remaining failure is an I/O error.
            try:
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
            except Exception as e:
                print(f"Error extracting text from file {file_path}: {str(e)}")
                return ""
        except Exception as e:
            print(f"Error extracting text from file {file_path}: {str(e)}")
            return ""

    @staticmethod
    def _clean_html(markup) -> str:
        """
        Parse HTML markup and return its visible text.

        Shared by extract_from_html and extract_from_url so the cleanup
        rules stay in one place.

        Args:
            markup: HTML source as str or bytes.

        Returns:
            Plain text, one logical line per output line, blank lines removed.
        """
        soup = BeautifulSoup(markup, 'html.parser')
        # Script and style contents are code, not visible text.
        for element in soup(["script", "style"]):
            element.extract()
        text = soup.get_text()
        # Strip whitespace on each line, then break side-by-side headlines
        # that get_text() renders with a wide gap. NOTE: split on TWO
        # spaces -- splitting on a single space would put every word on
        # its own line and destroy sentence structure.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop empty chunks so the result contains no blank lines.
        return '\n'.join(chunk for chunk in chunks if chunk)

    @staticmethod
    def extract_from_html(file_path: str) -> str:
        """
        Extract visible text from an HTML file.

        Args:
            file_path: Path to the HTML file.

        Returns:
            Cleaned text content ("" if the file could not be parsed).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            return TextExtractor._clean_html(html_content)
        except Exception as e:
            print(f"Error extracting text from HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_url(url: str) -> str:
        """
        Fetch a web page and extract its visible text.

        Args:
            url: Web URL to extract text from.

        Returns:
            Cleaned text content ("" on network or parse failure).
        """
        try:
            import requests
            # timeout prevents an unresponsive host from hanging ingest
            # forever; raise_for_status stops us from "extracting" the
            # body of a 404/500 error page as if it were content.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return TextExtractor._clean_html(response.content)
        except Exception as e:
            print(f"Error extracting text from URL {url}: {str(e)}")
            return ""
|