"""
Text extraction module for Norwegian RAG chatbot.
Extracts text from various document formats.
"""

import os
from typing import List, Optional

import PyPDF2
from bs4 import BeautifulSoup


class TextExtractor:
    """
    Extracts text from various document formats.

    Currently supports:
    - PDF (.pdf)
    - Text files (.txt)
    - HTML (.html, .htm)
    - Web pages fetched by URL (see ``extract_from_url``)

    All methods are static, so the class is used as a namespace and never
    needs to be instantiated. The format-specific extractors are best-effort:
    on failure they print a diagnostic message and return an empty string
    instead of raising.
    """

    @staticmethod
    def extract_from_file(file_path: str) -> str:
        """
        Extract text from a file based on its extension.

        The extension is compared case-insensitively.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text content (an empty string if the format-specific
            extractor failed)

        Raises:
            FileNotFoundError: If ``file_path`` does not exist
            ValueError: If the file extension is not a supported format
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            return TextExtractor.extract_from_pdf(file_path)
        elif file_extension == '.txt':
            return TextExtractor.extract_from_text(file_path)
        elif file_extension in ['.html', '.htm']:
            return TextExtractor.extract_from_html(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    @staticmethod
    def extract_from_pdf(file_path: str) -> str:
        """
        Extract text from a PDF file.

        The text of each page is followed by a blank line ("\\n\\n").

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content, or an empty string if the file could not
            be read or parsed
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Defensive: treat a missing (None) page text as empty so a
                # single page without extractable text cannot abort the
                # whole document with a TypeError.
                page_texts = [
                    (page.extract_text() or "") + "\n\n"
                    for page in pdf_reader.pages
                ]
        except Exception as e:
            print(f"Error extracting text from PDF {file_path}: {str(e)}")
            return ""

        return "".join(page_texts)

    @staticmethod
    def _read_text_with_fallback(file_path: str) -> str:
        """
        Read a text file as UTF-8, falling back to Latin-1.

        'utf-8-sig' decodes plain UTF-8 as well, but additionally drops a
        leading byte-order mark so it does not leak into the extracted text.
        Latin-1 maps every byte to a character, so the fallback cannot fail
        on decoding; I/O errors still propagate to the caller.

        Args:
            file_path: Path to the file to read

        Returns:
            Decoded file content
        """
        try:
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                return file.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as file:
                return file.read()

    @staticmethod
    def _html_to_text(markup) -> str:
        """
        Convert HTML markup into plain text with one phrase per line.

        Args:
            markup: HTML document as ``str`` or ``bytes``

        Returns:
            Visible text with script/style content removed, surrounding
            whitespace stripped and blank lines dropped
        """
        soup = BeautifulSoup(markup, 'html.parser')

        # Script and style bodies are code, not readable content.
        for element in soup(["script", "style"]):
            element.extract()

        text = soup.get_text()

        # Strip leading and trailing space on each line.
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines into a line each: a two-space gap inside a
        # line is treated as a separator between phrases. Splitting on a
        # single space would put every word on its own line.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines.
        return '\n'.join(chunk for chunk in chunks if chunk)

    @staticmethod
    def extract_from_text(file_path: str) -> str:
        """
        Extract text from a plain text file.

        The file is decoded as UTF-8 (a leading byte-order mark is dropped);
        if that fails it is re-read as Latin-1.

        Args:
            file_path: Path to the text file

        Returns:
            Extracted text content, or an empty string if the file could not
            be read
        """
        try:
            return TextExtractor._read_text_with_fallback(file_path)
        except Exception as e:
            print(f"Error extracting text from file {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_html(file_path: str) -> str:
        """
        Extract text from an HTML file.

        Uses the same UTF-8 / Latin-1 decoding strategy as
        ``extract_from_text`` so that non-UTF-8 files are not discarded.

        Args:
            file_path: Path to the HTML file

        Returns:
            Extracted text content, or an empty string if the file could not
            be read or parsed
        """
        try:
            html_content = TextExtractor._read_text_with_fallback(file_path)
            return TextExtractor._html_to_text(html_content)
        except Exception as e:
            print(f"Error extracting text from HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_url(url: str, timeout: float = 10.0) -> str:
        """
        Extract text from a web URL.

        Args:
            url: Web URL to extract text from
            timeout: Seconds to wait for the server before giving up;
                without a timeout a stalled server would block forever

        Returns:
            Extracted text content, or an empty string if the request failed,
            timed out, returned an HTTP error status, or could not be parsed
        """
        try:
            # Imported lazily so the module can be used for local files
            # without requests being installed.
            import requests

            response = requests.get(url, timeout=timeout)
            # Do not index the body of a 4xx/5xx error page as if it were
            # the requested document.
            response.raise_for_status()

            # Pass raw bytes so BeautifulSoup can determine the encoding.
            return TextExtractor._html_to_text(response.content)
        except Exception as e:
            print(f"Error extracting text from URL {url}: {str(e)}")
            return ""