"""
Text extraction module for Norwegian RAG chatbot.
Extracts text from various document formats.
"""

import os
import PyPDF2
from typing import List, Optional
from bs4 import BeautifulSoup

class TextExtractor:
    """
    Extracts plain text from various document sources.

    Supported inputs:
    - PDF (.pdf)
    - Text files (.txt)
    - HTML (.html, .htm)
    - Web pages fetched by URL (see ``extract_from_url``)

    All methods are static; the class only serves as a namespace. The
    format-specific extractors are best-effort: on failure they print an
    error message and return an empty string instead of raising.
    """

    @staticmethod
    def extract_from_file(file_path: str) -> str:
        """
        Extract text from a file based on its extension.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text content (empty string if the format-specific
            extractor failed)

        Raises:
            FileNotFoundError: If ``file_path`` does not exist
            ValueError: If the file extension is not supported
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Extension matching is case-insensitive (".PDF" == ".pdf").
        file_extension = os.path.splitext(file_path)[1].lower()

        handlers = {
            '.pdf': TextExtractor.extract_from_pdf,
            '.txt': TextExtractor.extract_from_text,
            '.html': TextExtractor.extract_from_html,
            '.htm': TextExtractor.extract_from_html,
        }
        handler = handlers.get(file_extension)
        if handler is None:
            raise ValueError(f"Unsupported file format: {file_extension}")
        return handler(file_path)

    @staticmethod
    def extract_from_pdf(file_path: str) -> str:
        """
        Extract text from a PDF file.

        The text of every page is followed by a blank line ("\\n\\n").

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content, or an empty string if the PDF could
            not be read
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Guard against a page yielding None so one text-less page
                # cannot discard the whole document via a TypeError.
                page_texts = [
                    page.extract_text() or "" for page in pdf_reader.pages
                ]
        except Exception as e:
            print(f"Error extracting text from PDF {file_path}: {str(e)}")
            return ""

        return "".join(f"{page_text}\n\n" for page_text in page_texts)

    @staticmethod
    def extract_from_text(file_path: str) -> str:
        """
        Extract text from a plain text file.

        The file is decoded as UTF-8, falling back to Latin-1 if it is not
        valid UTF-8.

        Args:
            file_path: Path to the text file

        Returns:
            Extracted text content, or an empty string if the file could
            not be read
        """
        try:
            return TextExtractor._read_text(file_path)
        except Exception as e:
            print(f"Error extracting text from file {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_html(file_path: str) -> str:
        """
        Extract the visible text from an HTML file.

        The file is decoded as UTF-8, falling back to Latin-1 if it is not
        valid UTF-8 (the same strategy as ``extract_from_text``).

        Args:
            file_path: Path to the HTML file

        Returns:
            Extracted text content, or an empty string if the file could
            not be read or parsed
        """
        try:
            html_content = TextExtractor._read_text(file_path)
            return TextExtractor._html_to_text(html_content)
        except Exception as e:
            print(f"Error extracting text from HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_url(url: str, timeout: float = 30.0) -> str:
        """
        Extract the visible text from a web page.

        Args:
            url: Web URL to extract text from
            timeout: Seconds to wait for the server before giving up

        Returns:
            Extracted text content, or an empty string if the page could
            not be fetched (including HTTP error statuses) or parsed
        """
        try:
            # Imported lazily so the file-based extractors work even when
            # requests is not installed.
            import requests

            # Without a timeout an unresponsive server blocks forever.
            response = requests.get(url, timeout=timeout)
            # Treat 4xx/5xx as failures rather than returning the text of
            # an error page as if it were the requested document.
            response.raise_for_status()
            return TextExtractor._html_to_text(response.content)
        except Exception as e:
            print(f"Error extracting text from URL {url}: {str(e)}")
            return ""

    @staticmethod
    def _read_text(file_path: str) -> str:
        """
        Read a file as text, trying UTF-8 first and Latin-1 second.

        Args:
            file_path: Path to the file to read

        Returns:
            The decoded file content

        Raises:
            OSError: If the file cannot be opened or read
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this decode
            # cannot itself fail with a UnicodeDecodeError.
            with open(file_path, 'r', encoding='latin-1') as file:
                return file.read()

    @staticmethod
    def _html_to_text(markup) -> str:
        """
        Convert HTML markup (str or bytes) into cleaned visible text.

        Args:
            markup: HTML document content

        Returns:
            Text with script/style content removed and whitespace
            normalised by ``_collapse_whitespace``
        """
        soup = BeautifulSoup(markup, 'html.parser')

        # Script and style bodies are code, not readable content.
        for element in soup(["script", "style"]):
            element.extract()

        return TextExtractor._collapse_whitespace(soup.get_text())

    @staticmethod
    def _collapse_whitespace(text: str) -> str:
        """
        Strip each line, split it on double spaces, and drop empty pieces.

        Args:
            text: Raw text, typically the output of ``soup.get_text()``

        Returns:
            The non-empty, stripped pieces joined by single newlines
        """
        lines = (line.strip() for line in text.splitlines())
        # Runs of two spaces are treated as separators between phrases
        # that were laid out side by side on the same line.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return '\n'.join(chunk for chunk in chunks if chunk)