Spaces:

HuggingFaceFW-Dev
/

PDF-Extraction-Comparisson

Running

File size: 20,652 Bytes

import gradio as gr
import os
import json
import base64
import tempfile
from pathlib import Path

# Extractor names offered in the UI (annotation buttons and dropdowns); each
# one is also looked up as a top-level key of the loaded JSON document.
EXTRACTORS = [
    'pdf_plumber',
    'py_pdf',
    'docling',
    'extractous',
    'pypdfium2',
    'pymupdf',
    'pymupdf_llm',
]

def add_page_breaks(text, page_offsets):
    """Return *text* with a page-break marker inserted after every offset.

    Each entry of *page_offsets* is used as the end index of a slice of
    *text*; the marker line is emitted right after that slice. Whatever text
    remains past the last offset is appended unchanged. When *page_offsets*
    is empty (or otherwise falsy) *text* is returned as-is.
    """
    if not page_offsets:
        return text

    marker = "\n<---page-break--->\n"
    pieces = []
    start = 0
    for end in page_offsets:
        pieces.extend((text[start:end], marker))
        start = end

    # Keep the tail that follows the final recorded offset.
    if start < len(text):
        pieces.append(text[start:])

    return "".join(pieces)

class ExtractorComparer:
    """Step through JSON extraction results and record the best extractor.

    Every file in the loaded directory is parsed with ``json.load``. The
    parsed document is indexed by extractor name; an extractor entry may carry
    a ``text`` string and a ``media`` list whose first item can hold the
    base64 encoded PDF (``media_bytes``) and page offsets
    (``metadata.pdf_metadata.page_offsets``). The chosen extractor for a file
    is stored next to it as ``<name>_best.txt``.
    """

    def __init__(self):
        self.json_files = []           # paths of the files found by load_files
        self.current_index = 0         # position of the shown file in json_files
        self.current_data = None       # parsed JSON of the shown file, if loaded
        self.temp_pdf_path = None      # on-disk copy of the shown PDF, if any
        self.current_pdf_bytes = None  # decoded PDF of the shown file, if any

    @staticmethod
    def _best_file_path(json_path):
        """Return the annotation file path that belongs to *json_path*."""
        return os.path.splitext(json_path)[0] + "_best.txt"

    @staticmethod
    def _decode_pdf_bytes(data):
        """Return the PDF bytes stored under ``pdf_plumber`` or ``None``."""
        if not isinstance(data, dict):
            return None
        plumber_data = data.get('pdf_plumber')
        if not isinstance(plumber_data, dict):
            return None
        media = plumber_data.get('media')
        if not isinstance(media, list) or not media:
            return None
        media_item = media[0]
        if not isinstance(media_item, dict):
            return None
        encoded = media_item.get('media_bytes')
        if not encoded:
            return None
        try:
            return base64.b64decode(encoded)
        except (ValueError, TypeError):
            # Undecodable payload: treat the document as having no PDF.
            return None

    @staticmethod
    def _page_offsets(extractor_data):
        """Return the page offsets recorded for an extractor entry, or ``[]``."""
        media = extractor_data.get('media')
        if not isinstance(media, list) or not media:
            return []
        try:
            return media[0]['metadata']['pdf_metadata']['page_offsets'] or []
        except (KeyError, TypeError):
            return []

    def _discard_temp_pdf(self):
        """Delete the temporary PDF copy of the previous document, if any."""
        if self.temp_pdf_path:
            try:
                os.remove(self.temp_pdf_path)
            except OSError:
                # Best effort: the file may already be gone.
                pass
            self.temp_pdf_path = None

    def _reset_document_state(self):
        """Forget everything that belongs to the previously shown document."""
        self.current_data = None
        self.current_pdf_bytes = None
        self._discard_temp_pdf()

    def _step(self, delta):
        """Move *delta* files forward (wrapping around) and load that file."""
        if not self.json_files:
            return None, "N/A", "N/A"
        self.current_index = (self.current_index + delta) % len(self.json_files)
        return self.load_current_file()

    def load_files(self, directory_path):
        """Collect the ``.json``/``.jsonl`` files of *directory_path*.

        Files are kept in sorted name order so navigation is reproducible.

        Returns:
            A ``(file_progress, annotation_status)`` tuple of display strings.
        """
        self.json_files = []
        self.current_index = 0
        # Nothing from a previously loaded directory may leak into this one.
        self._reset_document_state()
        try:
            self.json_files = [
                os.path.join(directory_path, filename)
                for filename in sorted(os.listdir(directory_path))
                if filename.endswith(('.json', '.jsonl'))
            ]
        except (OSError, TypeError, ValueError) as e:
            self.json_files = []
            return f"Error loading files: {str(e)}", "Error"

        if not self.json_files:
            return "No JSON files found", "No files loaded"
        return self.get_progress_info()

    def load_current_file(self):
        """Load the file at ``current_index``.

        Returns:
            A ``(base64_pdf, file_progress, annotation_status)`` tuple.
            ``base64_pdf`` is ``None`` when the file holds no decodable PDF or
            could not be loaded at all.
        """
        if not self.json_files:
            return None, "N/A", "N/A"

        # Clear first: if loading fails, callers must not keep seeing (or
        # annotating against) the data of the previously shown document.
        self._reset_document_state()

        try:
            with open(self.json_files[self.current_index], 'r', encoding='utf-8') as f:
                data = json.load(f)

            pdf_bytes = self._decode_pdf_bytes(data)
            if pdf_bytes:
                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
                    # Remember the path before writing so that a failed write
                    # is still cleaned up by the next load.
                    self.temp_pdf_path = temp_file.name
                    temp_file.write(pdf_bytes)
        except (OSError, ValueError):
            # ValueError covers malformed JSON and undecodable text.
            return None, "Error loading file", "No annotation"

        self.current_data = data
        self.current_pdf_bytes = pdf_bytes if pdf_bytes else None

        file_progress, annotation_status = self.get_progress_info()
        if not pdf_bytes:
            return None, file_progress, annotation_status

        # The frontend receives the PDF as base64 text.
        base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
        return base64_pdf, file_progress, annotation_status

    def get_progress_info(self):
        """Return ``(file_progress, annotation_status)`` for the current file."""
        if not self.json_files:
            return "No files loaded", "No annotation"

        current_file = self.json_files[self.current_index]
        filename = Path(current_file).name
        total = len(self.json_files)
        file_progress = f"File {self.current_index + 1} of {total}: {filename}"

        annotation_status = "Not annotated"
        try:
            with open(self._best_file_path(current_file), 'r', encoding='utf-8') as f:
                annotation_status = f"Best extractor: {f.read().strip()}"
        except (OSError, ValueError):
            # Missing or unreadable annotation file: report as not annotated.
            pass

        annotated_count = sum(
            1 for json_file in self.json_files
            if os.path.exists(self._best_file_path(json_file))
        )
        file_progress = f"{file_progress} (Annotated: {annotated_count}/{total})"

        return file_progress, annotation_status

    def get_extractor_text(self, extractor_name):
        """Return the text of *extractor_name* with page-break markers added.

        Returns an empty string when no document is loaded or the extractor is
        absent, and a short notice when the extractor entry has no text.
        """
        data = self.current_data
        if not isinstance(data, dict) or extractor_name not in data:
            return ""

        extractor_data = data[extractor_name]
        if not isinstance(extractor_data, dict) or 'text' not in extractor_data:
            return f"No text found for {extractor_name}"

        text = extractor_data['text']
        if text is None:
            text = ''

        return add_page_breaks(text, self._page_offsets(extractor_data))

    def next_pdf(self):
        """Load the next file in the list, wrapping around at the end."""
        return self._step(1)

    def prev_pdf(self):
        """Load the previous file in the list, wrapping around at the start."""
        return self._step(-1)

    def set_best_extractor(self, extractor_name):
        """Record *extractor_name* as the best extractor for the current file.

        Returns:
            The refreshed ``(file_progress, annotation_status)`` tuple, or
            ``("N/A", "N/A")`` when no document is currently loaded.
        """
        if not self.json_files or not self.current_data:
            return "N/A", "N/A"

        result_file = self._best_file_path(self.json_files[self.current_index])
        try:
            with open(result_file, 'w', encoding='utf-8') as f:
                f.write(extractor_name)
        except (OSError, TypeError):
            return "Error saving annotation", "No annotation"

        return self.get_progress_info()

def create_interface():
    """Build the Gradio Blocks UI for comparing extractor outputs.

    The page shows the PDF in an iframe, progress/annotation status, one
    annotation button per entry of ``EXTRACTORS``, and two dropdown-selected
    text panes. All callbacks close over a single ``ExtractorComparer``
    created here.

    NOTE(review): that one instance is presumably shared by every browser
    session connected to the app; confirm if multi-user use is expected.

    Returns:
        The ``gr.Blocks`` object; the caller launches it.
    """
    comparer = ExtractorComparer()

    # Custom CSS for basic font in text areas
    custom_css = """
    .extraction-text textarea {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 14px !important;
        line-height: 1.5 !important;
    }
    """

    # Static viewer markup; the script below swaps the iframe source and
    # toggles the fallback message.
    viewer_html = '''
                    <div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
                        <iframe id="pdf-iframe" width="100%" height="100%" style="border:none;" src="about:blank"></iframe>
                        <div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
                             display:flex; align-items:center; justify-content:center; padding:20px; text-align:center;">
                            Click "Load PDFs" to start viewing documents.
                        </div>
                    </div>
                    '''

    # Client-side setup: renders the base64 PDF held in the hidden textbox,
    # wires arrow-key navigation and adds a font style for the text panes.
    setup_js = """
            () => {
                console.log('Setting up PDF Blob handler...');
                window.currentPdfBlobUrl = null; // Store previous blob url globally

                function displayPdfBlob(base64Data) {
                    console.log('displayPdfBlob called - data length:', base64Data ? base64Data.length : 0);
                    const iframe = document.getElementById('pdf-iframe');
                    const fallbackDiv = document.getElementById('pdf-fallback');

                    if (!iframe) {
                        console.error('PDF iframe not found!');
                        return;
                    }

                    // Revoke previous Blob URL to free memory
                    if (window.currentPdfBlobUrl) {
                        console.log('Revoking previous Blob URL:', window.currentPdfBlobUrl);
                        URL.revokeObjectURL(window.currentPdfBlobUrl);
                        window.currentPdfBlobUrl = null;
                    }

                    if (base64Data && base64Data.length > 100) { // Ensure there's actual content
                        try {
                            // Hide fallback message
                            if (fallbackDiv) fallbackDiv.style.display = 'none';

                            // Decode Base64
                            const byteCharacters = atob(base64Data);
                            console.log('Base64 decoded successfully, length:', byteCharacters.length);
                            const byteNumbers = new Array(byteCharacters.length);
                            for (let i = 0; i < byteCharacters.length; i++) {
                                byteNumbers[i] = byteCharacters.charCodeAt(i);
                            }
                            const byteArray = new Uint8Array(byteNumbers);

                            // Create Blob and URL
                            const blob = new Blob([byteArray], {type: 'application/pdf'});
                            window.currentPdfBlobUrl = URL.createObjectURL(blob);
                            console.log('Created new Blob URL:', window.currentPdfBlobUrl);

                            // Update iframe source
                            iframe.src = window.currentPdfBlobUrl;
                            console.log('Iframe src updated to Blob URL');
                        } catch (e) {
                            console.error('Error processing Base64 data or creating Blob URL:', e);
                            if (fallbackDiv) {
                                fallbackDiv.innerHTML = '<div style="color:red;">Error loading PDF: ' + e.message + '</div>';
                                fallbackDiv.style.display = 'flex';
                            }
                            iframe.src = 'about:blank'; // Clear iframe on error
                        }
                    } else {
                        console.log('No valid Base64 data provided.');
                        if (fallbackDiv) {
                            fallbackDiv.innerHTML = '<div>No PDF available for this document.</div>';
                            fallbackDiv.style.display = 'flex';
                        }
                        iframe.src = 'about:blank'; // Clear iframe if no data
                    }
                }

                // MutationObserver to watch the hidden Textbox
                const targetNode = document.getElementById('pdf_base64_data');
                if (targetNode) {
                    // Find the actual textarea inside the Gradio component structure
                    const hiddenTextArea = targetNode.querySelector('textarea');
                    if(hiddenTextArea){
                        console.log('Found hidden textarea to observe.');
                        const observerConfig = { characterData: true, childList: true, subtree: true, attributes: true }; // Watch for all changes

                        // Last value handed to displayPdfBlob. Starts empty so the
                        // initial placeholder stays until a document is loaded.
                        let lastPdfValue = '';
                        const syncPdf = function() {
                            const value = hiddenTextArea.value || '';
                            if (value === lastPdfValue) {
                                return; // Unrelated mutation: do not reload the same PDF
                            }
                            lastPdfValue = value;
                            // Also called with an empty value so that a document
                            // without a PDF clears the previously shown one.
                            displayPdfBlob(value);
                        };

                        const observer = new MutationObserver(syncPdf);
                        observer.observe(targetNode, observerConfig);
                        console.log('MutationObserver attached');

                        // Try to display any initial value
                        setTimeout(syncPdf, 1000);

                    } else {
                        console.error('Could not find the textarea within #pdf_base64_data!');
                    }
                } else {
                    console.error('Hidden data element #pdf_base64_data not found!');
                }

                // Add keyboard shortcuts like in app.py
                document.addEventListener('keydown', function(event) {
                    if (event.target.tagName === 'INPUT' || event.target.tagName === 'TEXTAREA') {
                        return; // Ignore inputs
                    }
                    let targetButtonId = null;
                    const key = event.key;

                    if (key === 'ArrowLeft') targetButtonId = 'prev_button';
                    else if (key === 'ArrowRight') targetButtonId = 'next_button';

                    if (targetButtonId) {
                        const targetButton = document.getElementById(targetButtonId);
                        if (targetButton) {
                             event.preventDefault();
                             targetButton.click();
                        }
                    }
                });
                console.log('Keydown listener added.');

                // Additional style for basic font
                const additionalStyle = document.createElement('style');
                additionalStyle.textContent = `
                    .extraction-text textarea {
                        font-family: Arial, Helvetica, sans-serif !important;
                        font-size: 14px !important;
                    }
                `;
                document.head.appendChild(additionalStyle);
            }
            """

    def make_annotator(extractor_name):
        """Return a zero-argument callback that records *extractor_name*."""
        def annotate():
            return comparer.set_best_extractor(extractor_name)
        return annotate

    with gr.Blocks(title="PDF Extractor Comparer", theme="soft", css=custom_css) as demo:
        gr.Markdown("## PDF Extractor Comparer")

        with gr.Row():
            directory_input = gr.Textbox(
                label="Path to JSON Directory",
                placeholder="e.g., /path/to/your/json/files"
            )
            load_button = gr.Button("Load PDFs", variant="primary")

        # Main layout: PDF viewer on left, status and controls on right
        with gr.Row():
            # Left column: PDF viewer
            with gr.Column(scale=3):
                # PDF viewer using iframe with JavaScript handling
                pdf_viewer_html = gr.HTML(
                    label="PDF Document",
                    value=viewer_html
                )
                # Hidden component to store the Base64 PDF data
                pdf_data_hidden = gr.Textbox(visible=False, elem_id="pdf_base64_data")

            # Right column: Progress and controls
            with gr.Column(scale=1):
                # Progress information
                file_progress_output = gr.Textbox(label="File Progress", interactive=False)
                annotation_status_output = gr.Textbox(label="Annotation Status", interactive=False)

                # Navigation
                with gr.Row():
                    prev_button = gr.Button("⬅️ Previous", elem_id="prev_button")
                    next_button = gr.Button("Next ➡️", elem_id="next_button")

                # Best extractor selection
                gr.Markdown("### Select Best Extractor")
                for extractor in EXTRACTORS:
                    button = gr.Button(extractor, variant="secondary")
                    # The extractor name is bound in a closure, so no hidden
                    # input component is needed to pass a constant.
                    button.click(
                        make_annotator(extractor),
                        outputs=[file_progress_output, annotation_status_output]
                    )

        # Extractors section below the PDF
        gr.Markdown("### Extractor Comparison")

        # Extractor dropdowns
        with gr.Row():
            extractor1_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 1",
                value=EXTRACTORS[0] if EXTRACTORS else None
            )
            extractor2_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 2",
                value=EXTRACTORS[1] if len(EXTRACTORS) > 1 else EXTRACTORS[0] if EXTRACTORS else None
            )

        # Extractor text outputs with applied class for styling
        with gr.Row():
            extractor1_text = gr.Textbox(
                label="Extractor 1 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )
            extractor2_text = gr.Textbox(
                label="Extractor 2 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )

        def refresh_texts(event):
            """Chain a refresh of both text panes after *event* has run."""
            return event.then(
                comparer.get_extractor_text,
                inputs=[extractor1_dropdown],
                outputs=[extractor1_text]
            ).then(
                comparer.get_extractor_text,
                inputs=[extractor2_dropdown],
                outputs=[extractor2_text]
            )

        document_outputs = [pdf_data_hidden, file_progress_output, annotation_status_output]

        # Event handlers
        refresh_texts(
            load_button.click(
                comparer.load_files,
                inputs=[directory_input],
                outputs=[file_progress_output, annotation_status_output]
            ).then(
                comparer.load_current_file,
                outputs=document_outputs
            )
        )
        refresh_texts(prev_button.click(comparer.prev_pdf, outputs=document_outputs))
        refresh_texts(next_button.click(comparer.next_pdf, outputs=document_outputs))

        extractor1_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor1_dropdown],
            outputs=[extractor1_text]
        )

        extractor2_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor2_dropdown],
            outputs=[extractor2_text]
        )

        # Add JavaScript to handle PDF display
        demo.load(
            None, None, None,
            js=setup_js
        )

    return demo

# Build and launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()