# Source: Hugging Face Space "ikraamkb/Summarization" (status: Running).
# File size: 7,860 bytes.

import datetime
import hashlib
import os
import re
import shutil
import tempfile

import docx
import easyocr
import fitz  # PyMuPDF
import nltk
import openpyxl
import pptx
import torch
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, JSONResponse
from fpdf import FPDF
from gtts import gTTS
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Download the NLTK 'punkt' data at import time (quietly); chunk_text below
# relies on sent_tokenize for sentence splitting.
nltk.download('punkt', quiet=True)

app = FastAPI()

# CORS configuration: every origin, method and header is allowed, and
# credentials are enabled.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive — confirm this is intended outside of development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize models once at import time so every request reuses them.
# The tokenizer is also used directly by chunk_text to measure chunk sizes.
MODEL_NAME = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.eval()  # inference only
# device=-1 is presumably the pipeline's CPU setting — confirm against the
# transformers docs; inputs are processed in batches of 4.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)

# English OCR reader, used by extract_text as a fallback for PDFs with little
# extractable text; GPU is requested only when CUDA is available.
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
# In-process summary cache keyed by an MD5 of (text + length); nothing in the
# visible code evicts entries.
summary_cache = {}

def clean_text(text: str) -> str:
    """Normalize extracted document text before summarization.

    Removes, in order: numbered-list markers at the start of a line
    (``1. ``, ``10. ``), bullet characters, bracketed and parenthesized
    asides, and ``Page N`` markers. Whitespace is collapsed last so that
    the removals do not leave double spaces behind.

    Args:
        text: Raw text as extracted from a document.

    Returns:
        The cleaned text on a single line, stripped of surrounding whitespace.
    """
    # List markers are only recognized at the start of a line and must be
    # stripped *before* newlines are collapsed; matching ``\d\.\s+`` anywhere
    # would eat digits that end ordinary sentences ("in 2023. The ...").
    text = re.sub(r'^\s*\d{1,3}\.\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\u2022\s*', '', text)
    # DOTALL lets an aside that wraps across lines be removed as a whole.
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text, flags=re.DOTALL)
    text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', text).strip()

def _ocr_pdf_pages(doc) -> str:
    """OCR every page of an open PyMuPDF document and return the joined text."""
    lines = []
    for page in doc:
        # Close the handle right away so the rendered PNG can be written and
        # read by path on every platform; the file is removed in ``finally``.
        temp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        temp_img.close()
        try:
            page.get_pixmap().save(temp_img.name)
            lines.extend(reader.readtext(temp_img.name, detail=0))
        finally:
            os.unlink(temp_img.name)
    return "\n".join(lines)


def extract_text(file_path: str, file_extension: str):
    """Extract and clean the text of a PDF, DOCX, PPTX or XLSX file.

    PDFs that yield fewer than 50 characters of embedded text are treated as
    scanned documents and every page is run through OCR.

    Args:
        file_path: Path of the document on disk.
        file_extension: Lower-case extension without the dot
            (``"pdf"``, ``"docx"``, ``"pptx"`` or ``"xlsx"``).

    Returns:
        A ``(text, error)`` tuple. On success ``error`` is ``""``; on failure
        ``text`` is ``""`` and ``error`` describes the problem. This function
        does not raise for unreadable files.
    """
    try:
        if file_extension == "pdf":
            with fitz.open(file_path) as doc:
                text = "\n".join(page.get_text("text") for page in doc)
                if len(text.strip()) < 50:
                    # Little or no embedded text: fall back to OCR, keeping
                    # the embedded text if OCR finds nothing.
                    text = _ocr_pdf_pages(doc) or text
            return clean_text(text), ""

        elif file_extension == "docx":
            doc = docx.Document(file_path)
            return clean_text("\n".join(p.text for p in doc.paragraphs)), ""

        elif file_extension == "pptx":
            prs = pptx.Presentation(file_path)
            text = [shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")]
            return clean_text("\n".join(text)), ""

        elif file_extension == "xlsx":
            wb = openpyxl.load_workbook(file_path, read_only=True)
            try:
                # ``is not None`` keeps legitimate falsy values such as 0.
                text = [
                    " ".join(str(cell) for cell in row if cell is not None)
                    for sheet in wb.worksheets
                    for row in sheet.iter_rows(values_only=True)
                ]
            finally:
                # Read-only workbooks keep the file open until closed.
                wb.close()
            return clean_text("\n".join(text)), ""

        return "", "Unsupported file format"
    except Exception as e:
        # Boundary handler: callers expect an error string, not an exception.
        return "", f"Error reading {file_extension.upper()} file: {str(e)}"

def chunk_text(text: str, max_tokens: int = 950):
    """Split *text* into sentence-aligned chunks that fit the model's input.

    Sentences are appended to the current chunk for as long as the chunk's
    encoded length stays within *max_tokens*. A single sentence longer than
    the budget becomes its own (oversized) chunk.

    Args:
        text: Cleaned document text.
        max_tokens: Maximum encoded length of a chunk, as measured by the
            module-level ``tokenizer``.

    Returns:
        A list of non-empty chunk strings; empty when *text* has no content.
    """
    try:
        sentences = sent_tokenize(text)
    except Exception:
        # Best-effort fallback when sentence tokenization fails (for example
        # missing NLTK data): use fixed 20-word windows instead.
        words = text.split()
        sentences = [' '.join(words[i:i + 20]) for i in range(0, len(words), 20)]

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(tokenizer.encode(candidate)) <= max_tokens:
            current_chunk = candidate
        else:
            # Never emit an empty chunk: this branch is also reached when the
            # very first sentence is already over budget.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

def generate_summary(text: str, length: str = "medium") -> str:
    """Summarize *text* with the module-level summarization pipeline.

    The text is split with ``chunk_text``, each chunk is summarized, and the
    partial summaries are joined. Successful results are memoized in
    ``summary_cache``.

    Args:
        text: Cleaned document text.
        length: ``"short"``, ``"medium"`` or ``"long"``. Any other value is
            treated as ``"medium"``.

    Returns:
        The summary; a ``"[Batch error: ...]"`` placeholder if the model call
        failed; or a fixed notice when the result is 25 characters or fewer.
    """
    length_params = {
        "short": {"max_length": 80, "min_length": 30},
        "medium": {"max_length": 200, "min_length": 80},
        "long": {"max_length": 300, "min_length": 210}
    }
    if length not in length_params:
        length = "medium"

    # MD5 is used only as a compact cache key, not for security.
    cache_key = hashlib.md5((text + length).encode()).hexdigest()
    cached = summary_cache.get(cache_key)
    if cached is not None:
        return cached

    too_short = "Summary too short - document may be too brief"
    chunks = [chunk for chunk in chunk_text(text) if chunk]
    if not chunks:
        # Nothing to summarize; skip the model call entirely.
        return too_short

    failed = False
    try:
        summaries = summarizer(
            chunks,
            max_length=length_params[length]["max_length"],
            min_length=length_params[length]["min_length"],
            do_sample=False,
            truncation=True,
            no_repeat_ngram_size=2,
            num_beams=2,
            early_stopping=True
        )
        summary_texts = [s['summary_text'] for s in summaries]
    except Exception as e:
        failed = True
        summary_texts = [f"[Batch error: {str(e)}]"]

    final_summary = " ".join(summary_texts)
    # Upper-case only the first character of each sentence. str.capitalize()
    # would also lower-case the rest, mangling acronyms and proper nouns.
    sentences = (part.strip() for part in final_summary.split(". "))
    final_summary = ". ".join(s[0].upper() + s[1:] for s in sentences if s)
    final_summary = final_summary if len(final_summary) > 25 else too_short

    # Do not cache failures, so a transient model error can be retried.
    if not failed:
        summary_cache[cache_key] = final_summary
    return final_summary

def text_to_speech(text: str):
    """Render *text* to an MP3 file using gTTS.

    Args:
        text: The text to speak.

    Returns:
        Path of the generated ``.mp3`` in the system temp directory, or ``""``
        if synthesis failed (the error is printed, not raised).
    """
    audio_path = ""
    try:
        tts = gTTS(text)
        # Reserve a unique file name, then release the handle before gTTS
        # writes to that path so no open descriptor is left behind.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            audio_path = temp_audio.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print(f"Error in text-to-speech: {e}")
        # Do not leave an empty or partial file behind on failure.
        if audio_path and os.path.exists(audio_path):
            os.unlink(audio_path)
        return ""

def create_pdf(summary: str, original_filename: str):
    """Write the summary to a one-section PDF report.

    Args:
        summary: Summary text to place in the body of the report.
        original_filename: Name of the uploaded file, shown in the header.

    Returns:
        Path of the generated ``.pdf`` in the system temp directory, or ``""``
        if the PDF could not be created (the error is printed, not raised).
    """
    def _latin1(value) -> str:
        # The built-in Arial font only covers Latin-1; replace anything else
        # (curly quotes, dashes, non-Latin scripts) instead of failing.
        return str(value).encode("latin-1", "replace").decode("latin-1")

    pdf_path = ""
    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
        pdf.set_font("Arial", size=12)
        pdf.cell(200, 10, txt=_latin1(f"Original file: {original_filename}"), ln=1)
        pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
        pdf.ln(10)
        pdf.multi_cell(0, 10, txt=_latin1(summary))
        # Reserve a unique file name and release the handle before FPDF
        # writes to that path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            pdf_path = temp_pdf.name
        pdf.output(pdf_path)
        return pdf_path
    except Exception as e:
        print(f"Error creating PDF: {e}")
        # Do not leave an empty or partial file behind on failure.
        if pdf_path and os.path.exists(pdf_path):
            os.unlink(pdf_path)
        return ""

@app.post("/summarize/")
async def summarize_api(file: UploadFile = File(...), length: str = Form("medium")):
    """Summarize an uploaded document and produce audio and PDF renditions.

    Args:
        file: Uploaded PDF, DOCX, PPTX or XLSX document.
        length: Desired summary length: ``short``, ``medium`` or ``long``.

    Returns:
        A dict with ``summary``, ``audio_url`` and ``pdf_url``; either URL is
        ``None`` when that rendition could not be generated.

    Raises:
        HTTPException: 400 for an unsupported content type, an invalid
            ``length`` or an unreadable document; 500 for any other failure.
    """
    # Accepted content types and the extension each one implies.
    type_to_extension = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx'
    }

    if file.content_type not in type_to_extension:
        raise HTTPException(
            status_code=400,
            detail="Please upload a valid document (PDF, DOCX, PPTX, or XLSX)"
        )

    if length not in ("short", "medium", "long"):
        raise HTTPException(
            status_code=400,
            detail="length must be one of: short, medium, long"
        )

    # Prefer the extension from the file name; fall back to the one implied by
    # the content type when the name is missing or has no extension.
    extension = os.path.splitext(file.filename or "")[1][1:].lower()
    if not extension:
        extension = type_to_extension[file.content_type]

    temp_path = None
    try:
        # Save the upload to disk; record the path first so the ``finally``
        # block can clean up even if the copy fails part-way.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{extension}") as temp:
            temp_path = temp.name
            shutil.copyfileobj(file.file, temp)

        # Process file
        text, error = extract_text(temp_path, extension)
        if error:
            raise HTTPException(status_code=400, detail=error)

        summary = generate_summary(text, length)
        audio_path = text_to_speech(summary)
        pdf_path = create_pdf(summary, file.filename or f"upload.{extension}")

        return {
            "summary": summary,
            "audio_url": f"/files/{os.path.basename(audio_path)}" if audio_path else None,
            "pdf_url": f"/files/{os.path.basename(pdf_path)}" if pdf_path else None
        }

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Summarization failed: {str(e)}"
        )
    finally:
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)

@app.get("/files/{filename}")
async def get_file(filename: str):
    """Serve a generated audio or PDF file from the system temp directory.

    Args:
        filename: Bare file name, as returned in ``audio_url`` / ``pdf_url``
            by the summarize endpoint.

    Returns:
        A ``FileResponse`` streaming the requested file.

    Raises:
        HTTPException: 404 if the name is not a plain file name or no such
            regular file exists.
    """
    # Accept bare file names only, so a request cannot reach outside the
    # temp directory via path separators or "..".
    safe_name = os.path.basename(filename)
    if safe_name != filename or safe_name in ("", ".", ".."):
        raise HTTPException(status_code=404, detail="File not found")

    # NOTE(review): any regular file in the temp directory is still
    # retrievable by name; consider a dedicated output directory.
    file_path = os.path.join(tempfile.gettempdir(), safe_name)
    if os.path.isfile(file_path):
        return FileResponse(file_path)
    raise HTTPException(status_code=404, detail="File not found")