import os
import gradio as gr
import pandas as pd
from dockling_parser import DocumentParser
from dockling_parser.exceptions import ParserError, UnsupportedFormatError
import tempfile
import mimetypes
import traceback
import requests
from urllib.parse import urlparse

# Page title: passed to gr.Blocks as the app title and rendered as the top heading.
TITLE = "📄 Smart Document Parser"
# Introductory Markdown shown directly beneath the heading.
DESCRIPTION = """
A powerful document parsing application that automatically extracts structured information from various document formats.
Upload a document or provide a URL (PDF, DOCX, TXT, HTML, Markdown) and get structured information automatically.
"""

# Feature summary rendered as the last Markdown element of the page.
ARTICLE = """
## 🚀 Features

- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
- Support for File Upload and URLs
- Rich Information Extraction
- Smart Processing with Confidence Scoring
- Automatic Format Detection

Made with ❤️ using Docling and Gradio
"""

# Placeholder texts for the sections, entities and confidence outputs.
# Every canned error below ends with these same three fields.
_EMPTY_RESULT_FIELDS = (
    "No sections available",
    "No entities available",
    "Confidence Score: 0.0",
)

# Canned five-field results keyed by failure kind. Each value is
# (headline, guidance, sections text, entities text, confidence text).
ERROR_MESSAGES = {
    kind: (headline, guidance) + _EMPTY_RESULT_FIELDS
    for kind, headline, guidance in (
        (
            "no_input",
            "⚠️ No input provided",
            "Please upload a document or provide a URL.",
        ),
        (
            "invalid_url",
            "⚠️ Invalid URL",
            "Please provide a valid URL to a document.",
        ),
        (
            "download_error",
            "⚠️ Failed to download document",
            "Could not download the document from the provided URL.",
        ),
        (
            "unsupported_format",
            "⚠️ Unsupported file format",
            "Please provide a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
        ),
        (
            "processing_error",
            "⚠️ Error processing document",
            "An error occurred while processing the document. Please try again with a different file.",
        ),
    )
}

# Module-level parser instance: created once at import time and reused by
# every process_input call rather than being rebuilt per request.
parser = DocumentParser()

def download_file(url: str, timeout: float = 30.0) -> str:
    """Download the document at *url* into a temporary file and return its path.

    The temporary file is created with ``delete=False``, so on success the
    caller owns it and must remove it when finished. If anything fails after
    the file has been created, it is removed here before the error propagates.

    The file suffix is taken from the URL path (``.pdf`` when the path has no
    file name). When the path has a name but no extension, the suffix is
    guessed from the response's ``Content-Type`` header instead.

    Args:
        url: Absolute ``http`` or ``https`` URL of the document.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            stalled server cannot block the request forever.

    Returns:
        Filesystem path of the temporary file holding the response body.

    Raises:
        Exception: If the URL is not a usable http(s) URL, the request fails
            or returns an error status, or the file cannot be written. The
            underlying error is attached as ``__cause__``.
    """
    tmp_path = None
    try:
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
            raise ValueError(f"not a valid http(s) URL: {url!r}")

        response = requests.get(url, allow_redirects=True, timeout=timeout)
        response.raise_for_status()

        # Derive the suffix from the URL path, defaulting to a PDF name when
        # the path carries no file name at all.
        filename = os.path.basename(parsed_url.path) or "document.pdf"
        suffix = os.path.splitext(filename)[1]
        if not suffix:
            # No extension in the URL: fall back to the declared media type.
            content_type = response.headers.get("Content-Type", "")
            media_type = content_type.split(";")[0].strip()
            suffix = mimetypes.guess_extension(media_type) or ""

        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(response.content)
        return tmp_path

    except Exception as e:
        # Do not leave a half-written temp file behind on failure.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        raise Exception(f"Failed to download file: {str(e)}") from e

def _error_table(text):
    """Wrap an error text in the Property/Value table used for metadata."""
    return pd.DataFrame([{"Property": "Error", "Value": text}])


def _canned_error(key):
    """Return the ERROR_MESSAGES entry for *key* with its detail text as a table.

    The second output slot feeds a dataframe component, so a plain guidance
    string is wrapped in a one-row DataFrame instead of being passed through.
    """
    headline, detail, sections_msg, entities_msg, confidence_msg = ERROR_MESSAGES[key]
    if isinstance(detail, str):
        detail = _error_table(detail)
    return (headline, detail, sections_msg, entities_msg, confidence_msg)


def _failure_outputs(error_msg):
    """Build the five-output tuple reported when parsing raises."""
    return (
        error_msg,
        _error_table(error_msg),
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0"
    )


def _is_http_url(url):
    """Return True when *url* parses as an http(s) URL with a host part."""
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    return parts.scheme in ("http", "https") and bool(parts.netloc)


def process_input(file_input, url_input):
    """Parse an uploaded file or a document fetched from a URL.

    A non-empty URL takes precedence over an uploaded file. A document
    downloaded from a URL is stored in a temporary file that is always removed
    before returning.

    Args:
        file_input: Value of the upload component passed to ``parser.parse``,
            or ``None`` when nothing was uploaded.
        url_input: Text of the URL box; surrounding whitespace is ignored.

    Returns:
        A 5-tuple ``(content, metadata table, sections text, entities text,
        confidence text)``. On failure the same shape is returned, with the
        error message as content and a one-row error table as metadata.
    """
    url = url_input.strip() if isinstance(url_input, str) else url_input

    # Check if we have any input
    if file_input is None and not url:
        return _canned_error("no_input")

    temp_file = None
    try:
        if url:
            if not _is_http_url(url):
                return _canned_error("invalid_url")
            # Only the download is guarded by the download_error fallback;
            # parse failures are reported by the handlers further down.
            try:
                temp_file = download_file(url)
            except Exception:
                return _canned_error("download_error")
            source_path = temp_file
        else:
            source_path = file_input

        result = parser.parse(source_path)

        # Prepare the outputs
        metadata_df = pd.DataFrame([{
            "Property": k,
            "Value": str(v)
        } for k, v in result.metadata.dict().items()])

        # Extract structured content
        sections = result.structured_content.get('sections', []) or []
        sections_text = "\n\n".join(
            f"Section {i+1}:\n{section}" for i, section in enumerate(sections)
        )

        # Format entities if available; str() keeps non-string items from
        # breaking the join.
        entities = result.structured_content.get('entities', {})
        if entities:
            entities_text = "\n".join(
                f"{entity_type}: {', '.join(map(str, entities_list))}"
                for entity_type, entities_list in entities.items()
            )
        else:
            entities_text = "No entities detected"

        return (
            result.content,  # Main content
            metadata_df,     # Metadata as table
            sections_text,   # Structured sections
            entities_text,   # Named entities
            f"Confidence Score: {result.confidence_score:.2f}"  # Confidence score
        )

    except (UnsupportedFormatError, ParserError) as e:
        return _failure_outputs(f"⚠️ {str(e)}")
    except Exception as e:
        return _failure_outputs(
            f"⚠️ Unexpected error: {str(e)}\n{traceback.format_exc()}"
        )
    finally:
        # Best-effort cleanup of the downloaded temp file; a failed delete
        # must not mask the result being returned.
        if temp_file and os.path.exists(temp_file):
            try:
                os.unlink(temp_file)
            except OSError:
                pass

# Build the Gradio UI. Components are created inside nested context managers,
# so the order of the statements below determines the page layout.
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)
    
    # Top row: inputs in the first column, confidence readout in the second.
    with gr.Row():
        with gr.Column():
            # type="filepath" is requested so the handler is given a path it
            # can hand straight to the parser (see Gradio File docs).
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                type="filepath"
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf"
            )
            submit_btn = gr.Button("Process Document", variant="primary")
        
        with gr.Column():
            confidence = gr.Textbox(label="Processing Confidence")
    
    # One tab per remaining result field returned by process_input.
    with gr.Tabs():
        with gr.TabItem("📝 Content"):
            content_output = gr.Textbox(
                label="Extracted Content",
                lines=10,
                max_lines=30
            )
            
        with gr.TabItem("📊 Metadata"):
            metadata_output = gr.Dataframe(
                label="Document Metadata",
                headers=["Property", "Value"]
            )
            
        with gr.TabItem("📑 Sections"):
            sections_output = gr.Textbox(
                label="Document Sections",
                lines=10,
                max_lines=30
            )
            
        with gr.TabItem("🏷️ Entities"):
            entities_output = gr.Textbox(
                label="Named Entities",
                lines=5,
                max_lines=15
            )
    
    # Wire the button to process_input. The outputs list must stay in the same
    # order as the 5-tuple that process_input returns:
    # content, metadata, sections, entities, confidence.
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, url_input],
        outputs=[
            content_output,
            metadata_output,
            sections_output,
            entities_output,
            confidence
        ]
    )
    
    # Static help text: accepted file types and example URL sources.
    gr.Markdown("""
    ### 📌 Supported Formats
    - PDF Documents (*.pdf)
    - Word Documents (*.docx)
    - Text Files (*.txt)
    - HTML Files (*.html)
    - Markdown Files (*.md)
    
    ### 🔗 Example URLs
    - ArXiv PDFs: https://arxiv.org/pdf/2408.08921.pdf
    - Research Papers
    - Documentation
    """)
    
    gr.Markdown(ARTICLE)

# Launch the app only when this file is executed as a script; importing the
# module builds `iface` but does not start it.
if __name__ == "__main__":
    iface.launch()