Daryl Lim committed
Commit ca55264 · Parent(s): 15864df
Update app.py
app.py CHANGED
@@ -4,13 +4,13 @@ import shutil
 import torch
 import gradio as gr
 from pathlib import Path
-from typing import Optional, List,
-import
-
+from typing import Optional, List, Union
+import gc
+import time
 
 # Docling imports
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption, SimplePipeline
 
 # LangChain imports
@@ -26,16 +26,22 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # Initialize IBM Granite model and tokenizer
 print("Loading Granite model and tokenizer...")
-
+model_name = "ibm-granite/granite-3.3-8b-instruct"
+
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# Load model with optimization for GPU
 model = AutoModelForCausalLM.from_pretrained(
-
+    model_name,
     device_map="auto",
-    torch_dtype=torch.bfloat16
+    torch_dtype=torch.bfloat16,
+    load_in_8bit=True # Use 8-bit quantization for memory efficiency
 )
 print("Model loaded successfully!")
 
 # Helper function to detect document format
-def get_document_format(file_path) -> InputFormat:
+def get_document_format(file_path) -> Optional[InputFormat]:
     """Determine the document format based on file extension"""
     try:
         file_path = str(file_path)
@@ -48,9 +54,10 @@ def get_document_format(file_path) -> InputFormat:
             '.html': InputFormat.HTML,
             '.htm': InputFormat.HTML
         }
-        return format_map.get(extension
+        return format_map.get(extension)
     except Exception as e:
-
+        print(f"Error in get_document_format: {str(e)}")
+        return None
 
 # Function to convert documents to markdown
 def convert_document_to_markdown(doc_path) -> str:
@@ -59,16 +66,19 @@ def convert_document_to_markdown(doc_path) -> str:
         # Convert to absolute path string
         input_path = os.path.abspath(str(doc_path))
         print(f"Converting document: {doc_path}")
+
         # Create temporary directory for processing
         with tempfile.TemporaryDirectory() as temp_dir:
             # Copy input file to temp directory
             temp_input = os.path.join(temp_dir, os.path.basename(input_path))
             shutil.copy2(input_path, temp_input)
+
             # Configure pipeline options
             pipeline_options = PdfPipelineOptions()
-            pipeline_options.do_ocr = False # Disable OCR
+            pipeline_options.do_ocr = False # Disable OCR for performance
             pipeline_options.do_table_structure = True
-
+
+            # Create converter with optimized options
             converter = DocumentConverter(
                 allowed_formats=[
                     InputFormat.PDF,
@@ -85,104 +95,66 @@ def convert_document_to_markdown(doc_path) -> str:
                     )
                 }
             )
+
            # Convert document
            print("Starting conversion...")
            conv_result = converter.convert(temp_input)
            if not conv_result or not conv_result.document:
                raise ValueError(f"Failed to convert document: {doc_path}")
+
            # Export to markdown
            print("Exporting to markdown...")
            md = conv_result.document.export_to_markdown()
+
            # Create output path
            output_dir = os.path.dirname(input_path)
            base_name = os.path.splitext(os.path.basename(input_path))[0]
            md_path = os.path.join(output_dir, f"{base_name}_converted.md")
+
            # Write markdown file
-           print(f"Writing markdown to: {base_name}_converted.md")
            with open(md_path, "w", encoding="utf-8") as fp:
                fp.write(md)
            return md_path
     except Exception as e:
         return f"Error converting document: {str(e)}"
 
-# Function to download file from URL
-def download_file_from_url(url: str) -> Optional[str]:
-    """Download a file from a URL and save it temporarily"""
-    try:
-        # Parse URL to get filename
-        parsed_url = urlparse(url)
-        filename = os.path.basename(parsed_url.path)
-
-        if not filename:
-            filename = "downloaded_document"
-
-        # Add extension based on Content-Type if needed
-        response = requests.get(url, stream=True)
-        response.raise_for_status()
-
-        content_type = response.headers.get('Content-Type', '')
-        if 'pdf' in content_type:
-            if not filename.lower().endswith('.pdf'):
-                filename += ".pdf"
-        elif 'word' in content_type or 'docx' in content_type:
-            if not filename.lower().endswith(('.doc', '.docx')):
-                filename += ".docx"
-        elif 'powerpoint' in content_type or 'pptx' in content_type:
-            if not filename.lower().endswith(('.ppt', '.pptx')):
-                filename += ".pptx"
-        elif 'html' in content_type:
-            if not filename.lower().endswith(('.html', '.htm')):
-                filename += ".html"
-
-        # Create a temporary file
-        temp_dir = tempfile.gettempdir()
-        file_path = os.path.join(temp_dir, filename)
-
-        # Save the file
-        with open(file_path, 'wb') as f:
-            for chunk in response.iter_content(chunk_size=8192):
-                f.write(chunk)
-
-        return file_path
-    except Exception as e:
-        print(f"Error downloading file: {str(e)}")
-        return None
-
 # Function to generate a summary using the IBM Granite model
-def generate_summary(chunks: List[Document], model, tokenizer, summary_type="abs
-    """Generate a summary from document chunks using the IBM Granite model
+def generate_summary(chunks: List[Document], length_type="sentences", length_count=3):
+    """Generate a summary from document chunks using the IBM Granite model
+
+    Args:
+        chunks: List of document chunks to summarize
+        length_type: Either "sentences" or "paragraphs"
+        length_count: Number of sentences (1-10) or paragraphs (1-3)
+    """
     # Concatenate the retrieved chunks
     combined_text = " ".join([chunk.page_content for chunk in chunks])
 
-    #
-    if
-
-    else: #
-
-
-    if detail_level == "high":
-        detail_instruction = "Include specific details and examples."
-    elif detail_level == "medium":
-        detail_instruction = "Balance key points with some supporting details."
-    else: # low
-        detail_instruction = "Focus only on the main points and key takeaways."
-
-    if length == "short":
-        length_instruction = "Keep the summary concise and brief."
-    elif length == "medium":
-        length_instruction = "Create a moderate-length summary."
-    else: # long
-        length_instruction = "Provide a comprehensive, detailed summary."
+    # Construct length instruction based on type and count
+    if length_type == "sentences":
+        length_instruction = f"Summarize the following text in {length_count} sentence{'s' if length_count > 1 else ''}."
+    else: # paragraphs
+        length_instruction = f"Summarize the following text in {length_count} paragraph{'s' if length_count > 1 else ''}."
 
-    # Construct the
+    # Construct the prompt
     prompt = f"""<instruction>
-
-
+Knowledge Cutoff Date: April 2024. You are Granite, developed by IBM. You are a helpful AI assistant. {length_instruction} Your response should only include the answer. Do not provide any further explanation.
+</instruction>
+
+<text>
+{combined_text}
+</text>
+"""
 
-
-
-
-
+    # Calculate appropriate max_new_tokens based on length requirements
+    # Approximate tokens: ~15 tokens per sentence, ~75 tokens per paragraph
+    if length_type == "sentences":
+        max_tokens = length_count * 20 # Slightly more than needed for flexibility
+    else: # paragraphs
+        max_tokens = length_count * 100 # Slightly more than needed for flexibility
+
+    # Ensure minimum tokens and add buffer
+    max_tokens = max(100, min(1000, max_tokens + 50))
 
     # Generate the summary using the IBM Granite model
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
@@ -190,7 +162,7 @@ def generate_summary(chunks: List[Document], model, tokenizer, summary_type="abs
     with torch.no_grad():
         output = model.generate(
             **inputs,
-            max_new_tokens=
+            max_new_tokens=max_tokens,
             temperature=0.7,
             top_p=0.9,
             do_sample=True
@@ -204,68 +176,50 @@ def generate_summary(chunks: List[Document], model, tokenizer, summary_type="abs
 
     return summary.strip()
 
-# Function to
-def
-    """
-
-
-
-
-        batch_chunks = [retriever.vectorstore.docstore.search(doc_id) for doc_id in batch_ids]
-        all_chunks.extend(batch_chunks)
-
-    # Process chunks in manageable batches if needed
-    summaries = []
-    for i in range(0, len(all_chunks), chunk_size):
-        batch = all_chunks[i:i+chunk_size]
-        summary = generate_summary(
-            batch,
-            model,
-            tokenizer,
-            summary_type=summary_params.get("summary_type", "abstractive"),
-            detail_level=summary_params.get("detail_level", "medium"),
-            length=summary_params.get("length", "medium")
-        )
-
-
-
-
-
-
-            tokenizer,
-            summary_type=summary_params.get("summary_type", "abstractive"),
-            detail_level=summary_params.get("detail_level", "medium"),
-            length=summary_params.get("length", "medium")
-        )
-
-
-
+# Function to process document chunks efficiently
+def process_document_chunks(texts, batch_size=8):
+    """Process document chunks in efficient batches"""
+    try:
+        # Create embeddings with optimized settings
+        embeddings = HuggingFaceEmbeddings(
+            model_name="nomic-ai/nomic-embed-text-v1",
+            model_kwargs={'trust_remote_code': True}
+        )
+
+        # Create vector store more efficiently
+        vectorstore = FAISS.from_documents(
+            texts,
+            embeddings,
+            # Add distance function for better retrieval
+            distance_strategy="cosine"
+        )
+
+        return vectorstore
+    except Exception as e:
+        print(f"Error in document processing: {str(e)}")
+        # Fallback to basic processing if optimization fails
+        return FAISS.from_documents(texts, embeddings)
 
 # Main function to process document and generate summary
 @spaces.GPU
 def process_document(
     file_obj: Optional[Union[str, tempfile._TemporaryFileWrapper]] = None,
-
-
-    detail_level: str = "medium",
-    length: str = "medium",
+    length_type: str = "sentences",
+    length_count: int = 3,
     progress=gr.Progress()
 ):
-    """Process a document file
+    """Process a document file and generate a summary"""
     try:
-        # Process input
-
-
-
-
-
-
-
-
-            return "Please provide either a file or a URL to summarize."
+        # Process input file
+        if not file_obj:
+            return "Please provide a file to summarize."
+
+        document_path = file_obj.name if hasattr(file_obj, 'name') else str(file_obj)
+
+        # Validate document format
+        format_type = get_document_format(document_path)
+        if not format_type:
+            return "Unsupported file format. Please upload a PDF, DOCX, PPTX, or HTML file."
 
         # Convert document to markdown
         progress(0.3, "Converting document to markdown...")
@@ -278,41 +232,78 @@ def process_document(
         loader = UnstructuredMarkdownLoader(str(markdown_path))
         documents = loader.load()
 
+        # Optimize text splitting for better chunks
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=
-            chunk_overlap=
-            length_function=len
+            chunk_size=1000, # Larger chunk size for better context
+            chunk_overlap=100,
+            length_function=len,
+            separators=["\n\n", "\n", ".", " ", ""] # Prioritize splitting at paragraph/sentence boundaries
         )
         texts = text_splitter.split_documents(documents)
 
         if not texts:
             return "No text could be extracted from the document."
 
-        # Create
-        progress(0.6, "
-
-            model_name="nomic-ai/nomic-embed-text-v1",
-            model_kwargs={'trust_remote_code': True}
-        )
-        vectorstore = FAISS.from_documents(texts, embeddings)
+        # Create vector store with efficient processing
+        progress(0.6, "Processing document content...")
+        vectorstore = process_document_chunks(texts)
 
-        # Create retriever
+        # Create retriever with optimized settings
         retriever = vectorstore.as_retriever(
             search_type="similarity",
-            search_kwargs={"k": 4}
+            search_kwargs={"k": 4} # Number of chunks to retrieve
         )
 
-        #
+        # Process chunks in smaller batches for memory efficiency
         progress(0.8, "Generating summary...")
-
-
-
-
-
-        summary = summarize_full_document(retriever, model, tokenizer, summary_params)
-
-
-
+        all_chunks = []
+        batch_size = 4 # Smaller batch size for memory efficiency
+
+        # Get all document chunks
+        doc_ids = list(vectorstore.index_to_docstore_id.values())
+
+        # Process in smaller batches
+        for i in range(0, len(doc_ids), batch_size):
+            batch_ids = doc_ids[i:i+batch_size]
+            batch_chunks = [vectorstore.docstore.search(doc_id) for doc_id in batch_ids]
+            all_chunks.extend(batch_chunks)
+
+            # Force garbage collection to free memory
+            gc.collect()
+
+            # Sleep briefly to allow memory cleanup
+            time.sleep(0.1)
+
+        # Generate summary from chunks
+        if len(all_chunks) > 8:
+            # If we have many chunks, process in batches
+            summaries = []
+            for i in range(0, len(all_chunks), batch_size):
+                batch = all_chunks[i:i+batch_size]
+                summary = generate_summary(
+                    batch,
+                    length_type=length_type,
+                    length_count=max(1, length_count // 2) # Use smaller count for partial summaries
+                )
+                summaries.append(summary)
+
+                # Force garbage collection
+                gc.collect()
+
+            # Create final summary from batch summaries
+            final_summary = generate_summary(
+                [Document(page_content=s) for s in summaries],
+                length_type=length_type,
+                length_count=length_count
+            )
+            return final_summary
+        else:
+            # If we have few chunks, generate summary directly
+            return generate_summary(
+                all_chunks,
+                length_type=length_type,
+                length_count=length_count
+            )
 
     except Exception as e:
         return f"Error processing document: {str(e)}"
@@ -320,61 +311,90 @@ def process_document(
 # Create Gradio interface
 def create_gradio_interface():
     """Create and launch the Gradio interface"""
-    with gr.Blocks(title="Document
-    gr.Markdown("# Document
-    gr.Markdown("Upload a document
+    with gr.Blocks(title="Granite Document Summarization") as app:
+        gr.Markdown("# Granite Document Summarization")
+        gr.Markdown("Upload a document to generate a summary.")
 
         with gr.Row():
-            with gr.Column():
-                file_input = gr.File(
-
+            with gr.Column(scale=1):
+                file_input = gr.File(
+                    label="Upload Document (PDF, DOCX, PPTX, HTML)",
+                    file_types=[".pdf", ".docx", ".doc", ".pptx", ".html", ".htm"]
+                )
 
                 with gr.Row():
-
-
-
-
-
-                )
+                    length_type = gr.Radio(
+                        choices=["Sentences", "Paragraphs"],
+                        value="Sentences",
+                        label="Summary Length Type"
+                    )
 
                 with gr.Row():
-
-
-
-
-
-
+                    # Use slider for sentence count (1-10)
+                    sentence_count = gr.Slider(
+                        minimum=1,
+                        maximum=10,
+                        value=3,
+                        step=1,
+                        label="Number of Sentences",
+                        visible=True
+                    )
 
-
-
-
-
-
-
+                    # Use radio for paragraph count (1-3)
+                    paragraph_count = gr.Radio(
+                        choices=["1", "2", "3"],
+                        value="1",
+                        label="Number of Paragraphs",
+                        visible=False
+                    )
 
-                submit_btn = gr.Button("
+                submit_btn = gr.Button("Summarize", variant="primary")
 
-            with gr.Column():
-                output = gr.
-                    label="Summary
+            with gr.Column(scale=2):
+                output = gr.TextArea(
+                    label="Summary",
                     lines=15,
                     max_lines=30
                 )
 
+        # Add interactivity to show/hide appropriate count selector
+        def update_count_visibility(length_type):
+            return {
+                sentence_count: length_type == "Sentences",
+                paragraph_count: length_type == "Paragraphs"
+            }
+
+        length_type.change(
+            fn=update_count_visibility,
+            inputs=[length_type],
+            outputs=[sentence_count, paragraph_count]
+        )
+
+        # Function to convert paragraph count from string to int and handle capitalized length types
+        def process_document_wrapper(file, length_type, sentence_count, paragraph_count):
+            # Convert capitalized length_type to lowercase for processing
+            length_type_lower = length_type.lower()
+
+            if length_type_lower == "sentences":
+                return process_document(file, length_type_lower, int(sentence_count))
+            else:
+                return process_document(file, length_type_lower, int(paragraph_count))
+
         submit_btn.click(
-            fn=
-            inputs=[file_input,
+            fn=process_document_wrapper,
+            inputs=[file_input, length_type, sentence_count, paragraph_count],
             outputs=output
         )
 
         gr.Markdown("""
        ## How to use:
-       1. Upload a document (PDF, DOCX, PPTX, HTML)
-       2. Choose your
-       -
-       -
-
-
+       1. Upload a document (PDF, DOCX, PPTX, HTML)
+       2. Choose your summary length preference:
+          - Number of Sentences (1-10)
+          - Number of Paragraphs (1-3)
+       3. Click "Summarize" to process the document
+
+       *This application uses the IBM Granite 3.3-8b model to generate summaries.*
        """)
 
     return app
@@ -382,4 +402,4 @@ def create_gradio_interface():
 # Launch the application
 if __name__ == "__main__":
     app = create_gradio_interface()
-    app.launch()
+    app.launch()
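For reviewers who want to sanity-check the new length controls without loading the 8B model, here is a minimal, self-contained sketch that mirrors the prompt construction this commit adds to generate_summary. The build_prompt helper is hypothetical, introduced here for illustration only; in app.py the same f-string is built inline. The instruction text and the sentence/paragraph pluralization follow the diff above.

# Illustrative sketch only: build_prompt is not part of app.py.
def build_prompt(combined_text: str, length_type: str = "sentences", length_count: int = 3) -> str:
    # Mirror of the length instruction introduced in generate_summary
    unit = "sentence" if length_type == "sentences" else "paragraph"
    plural = "s" if length_count > 1 else ""
    length_instruction = f"Summarize the following text in {length_count} {unit}{plural}."
    # Same <instruction>/<text> layout as the prompt in the diff
    return (
        "<instruction>\n"
        "Knowledge Cutoff Date: April 2024. You are Granite, developed by IBM. "
        "You are a helpful AI assistant. "
        f"{length_instruction} "
        "Your response should only include the answer. Do not provide any further explanation.\n"
        "</instruction>\n\n"
        f"<text>\n{combined_text}\n</text>\n"
    )

if __name__ == "__main__":
    print(build_prompt("Docling converts the uploaded file to markdown before chunking.", "paragraphs", 2))

For long documents the commit applies this prompt twice: once per batch of chunks with a halved length_count, then once more over the partial summaries to produce the final answer, which keeps each generate() call within the max_tokens budget computed in generate_summary.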