File size: 5,387 Bytes
b34efa5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
"""
Text extraction module for Norwegian RAG chatbot.
Extracts text from various document formats.
"""
import os
import PyPDF2
from typing import List, Optional
from bs4 import BeautifulSoup
class TextExtractor:
    """
    Extracts text from various document formats.

    Currently supports:
    - PDF (.pdf)
    - Text files (.txt)
    - HTML (.html, .htm)

    All public extractors return the extracted text, or "" on extraction
    failure (errors are reported to stdout rather than raised, so one bad
    document does not abort a batch ingest).
    """

    @staticmethod
    def extract_from_file(file_path: str) -> str:
        """
        Extract text from a file, dispatching on its extension.

        Args:
            file_path: Path to the document file.

        Returns:
            Extracted text content ("" if extraction failed).

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: If the extension is not one of .pdf/.txt/.html/.htm.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            return TextExtractor.extract_from_pdf(file_path)
        elif file_extension == '.txt':
            return TextExtractor.extract_from_text(file_path)
        elif file_extension in ('.html', '.htm'):
            return TextExtractor.extract_from_html(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    @staticmethod
    def extract_from_pdf(file_path: str) -> str:
        """
        Extract text from a PDF file.

        Args:
            file_path: Path to the PDF file.

        Returns:
            Extracted text content, pages separated by blank lines
            ("" if the PDF could not be read).
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # extract_text() can return None for pages with no text
                # layer (e.g. scanned images); coerce to "" so the page
                # separator concatenation cannot raise TypeError.
                return "".join(
                    (page.extract_text() or "") + "\n\n"
                    for page in pdf_reader.pages
                )
        except Exception as e:
            print(f"Error extracting text from PDF {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_text(file_path: str) -> str:
        """
        Extract text from a plain text file.

        Tries UTF-8 first, then falls back to Latin-1 (common for older
        Norwegian documents with æ/ø/å in legacy encodings).

        Args:
            file_path: Path to the text file.

        Returns:
            File contents ("" if the file could not be read).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a code point, so this fallback
            # always decodes; any remaining failure is an I/O error.
            try:
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
            except Exception as e:
                print(f"Error extracting text from file {file_path}: {str(e)}")
                return ""
        except Exception as e:
            print(f"Error extracting text from file {file_path}: {str(e)}")
            return ""

    @staticmethod
    def _clean_html(markup) -> str:
        """
        Parse HTML markup and return its visible text.

        Shared by extract_from_html and extract_from_url so the cleanup
        rules stay in one place.

        Args:
            markup: HTML source as str or bytes.

        Returns:
            Plain text, one logical line per output line, blank lines removed.
        """
        soup = BeautifulSoup(markup, 'html.parser')
        # Script and style contents are code, not visible text.
        for element in soup(["script", "style"]):
            element.extract()
        text = soup.get_text()
        # Strip whitespace on each line, then break side-by-side headlines
        # that get_text() renders with a wide gap. NOTE: split on TWO
        # spaces -- splitting on a single space would put every word on
        # its own line and destroy sentence structure.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop empty chunks so the result contains no blank lines.
        return '\n'.join(chunk for chunk in chunks if chunk)

    @staticmethod
    def extract_from_html(file_path: str) -> str:
        """
        Extract visible text from an HTML file.

        Args:
            file_path: Path to the HTML file.

        Returns:
            Cleaned text content ("" if the file could not be parsed).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            return TextExtractor._clean_html(html_content)
        except Exception as e:
            print(f"Error extracting text from HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_url(url: str) -> str:
        """
        Fetch a web page and extract its visible text.

        Args:
            url: Web URL to extract text from.

        Returns:
            Cleaned text content ("" on network or parse failure).
        """
        try:
            import requests
            # timeout prevents an unresponsive host from hanging ingest
            # forever; raise_for_status stops us from "extracting" the
            # body of a 404/500 error page as if it were content.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return TextExtractor._clean_html(response.content)
        except Exception as e:
            print(f"Error extracting text from URL {url}: {str(e)}")
            return ""
|